Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / intrapred8.asm
CommitLineData
72b9787e
JB
1;*****************************************************************************
2;* Copyright (C) 2013 x265 project
3;*
4;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5;*
6;* This program is free software; you can redistribute it and/or modify
7;* it under the terms of the GNU General Public License as published by
8;* the Free Software Foundation; either version 2 of the License, or
9;* (at your option) any later version.
10;*
11;* This program is distributed in the hope that it will be useful,
12;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14;* GNU General Public License for more details.
15;*
16;* You should have received a copy of the GNU General Public License
17;* along with this program; if not, write to the Free Software
18;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19;*
20;* This program is also available under a commercial proprietary license.
21;* For more information, contact us at license @ x265.com.
22;*****************************************************************************/
23
24%include "x86inc.asm"
25%include "x86util.asm"
26
27SECTION_RODATA 32
28
; Constant tables. Within the part of the file visible here only pb_0_8,
; pb_unpackbw1, c_trans_4x4 and ang_table are referenced; the remaining tables
; are presumably consumed by routines further down the file.
29pb_0_8 times 8 db 0, 8 ; pshufb mask: source byte 0 in even lanes, source byte 8 in odd lanes
30pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 ; pshufb mask: source bytes 1..4 in even lanes, source byte 8 in odd lanes
31pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0 ; indices 7..0, stored twice
32c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 ; pshufb mask that transposes a 4x4 byte matrix
33tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
34pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
35c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
36c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
37c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
38c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
39c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
40c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
41c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
42c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
43c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
44c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
45c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
46tab_S1: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
47pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
48c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
49c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
50c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
51c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
52c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
53c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
54c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
55tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
56
; ang_table: 32 entries of 16 bytes each. Entry x (0..31) is the byte pair
; (32 - x, x) repeated 8 times, so entry x starts at ang_table + x * 16.
; The angular routines feed these pairs to pmaddubsw as two-tap weights.
57const ang_table
58%assign x 0
59%rep 32
60 times 8 db (32-x), x
61%assign x x+1
62%endrep
63
64SECTION .text
65
66cextern pw_8
67cextern pw_1024
68cextern pb_unpackbd1
69cextern multiL
70cextern multiH
71cextern multiH2
72cextern multiH3
73cextern multi_2Row
74
75;-----------------------------------------------------------------------------
76; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 4x4 DC prediction. Every output pixel is the rounded mean of the 4 bytes at
; left + 1 and the 4 bytes at above + 1. When filter is non-zero, row 0 and
; column 0 are then rewritten as (ref + 3 * DC + 2) >> 2, and the top-left
; pixel as (left + above + 2 * DC + 2) >> 2.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; r4/r5 are scratch.
; dirMode is not read.
77;-----------------------------------------------------------------------------
78INIT_XMM sse4
79cglobal intra_pred_dc4, 4,6,3
80 mov r4d, r5m ; r4d = filter argument
81 inc r2 ; r2 -> left + 1
82 inc r3 ; r3 -> above + 1
83 pxor m0, m0 ; m0 = 0 (psadbw reference and pshufb broadcast mask)
84 movd m1, [r2]
85 movd m2, [r3]
86 punpckldq m1, m2 ; low qword = 4 left bytes followed by 4 above bytes
87 psadbw m1, m0 ; m1 = sum
88
89 test r4d, r4d ; ZF = (filter == 0); consumed by "jz .end" below, nothing in between writes EFLAGS
90
91 mov r4d, 4096 ; pmulhrsw by 4096 == (x * 4096 + 0x4000) >> 15
92 movd m2, r4d
93 pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
94 movd r4d, m1 ; r4d = dc_val
95 pshufb m1, m0 ; m1 = byte [dc_val ...]
96
97 ; store DC 4x4
98 lea r5, [r1 * 3]
99 movd [r0], m1
100 movd [r0 + r1], m1
101 movd [r0 + r1 * 2], m1
102 movd [r0 + r5], m1
103
104 ; do DC filter
105 jz .end ; filter == 0 (flags from the test above): plain DC block is final
106 lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
107 add r4d, r5d ; r4d = DC * 3 + 2
108 movd m1, r4d
109 pshuflw m1, m1, 0 ; m1 = pixDCx3
110
111 ; filter top
112 pmovzxbw m2, [r3] ; above pixels widened to words
113 paddw m2, m1
114 psraw m2, 2 ; (above[i] + 3 * DC + 2) >> 2
115 packuswb m2, m2
116 movd [r0], m2 ; overwrite top-left pixel, we will update it later
117
118 ; filter top-left
119 movzx r3d, byte [r3] ; first above pixel (r3 is no longer needed as a pointer)
120 add r5d, r3d
121 movzx r3d, byte [r2] ; first left pixel
122 add r3d, r5d
123 shr r3d, 2 ; (left + above + 2 * DC + 2) >> 2
124 mov [r0], r3b
125
126 ; filter left
127 add r0, r1 ; r0 -> row 1
128 pmovzxbw m2, [r2 + 1] ; left pixels starting with the one beside row 1
129 paddw m2, m1
130 psraw m2, 2
131 packuswb m2, m2
132 pextrb [r0], m2, 0 ; column 0 of row 1
133 pextrb [r0 + r1], m2, 1 ; column 0 of row 2
134 pextrb [r0 + r1 * 2], m2, 2 ; column 0 of row 3
135
136.end:
137 RET
138
139
140;-------------------------------------------------------------------------------------------
141; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 8x8 DC prediction. DC = (sum of 8 bytes at left + 1 and 8 bytes at above + 1
; + 8) >> 4 is written to the whole block. When filter is non-zero, row 0 and
; column 0 are rewritten as (ref + 3 * DC + 2) >> 2 and the top-left pixel as
; (left + above + 2 * DC + 2) >> 2.
; r0 = dst (advanced while storing), r1 = dstStride, r2 = left, r3 = above,
; r6 = saved dst for the filter pass; r4/r5 are scratch. dirMode is not read.
142;-------------------------------------------------------------------------------------------
143INIT_XMM sse4
144cglobal intra_pred_dc8, 4, 7, 3
145 mov r4d, r5m ; r4d = filter argument
146 inc r2 ; r2 -> left + 1
147 inc r3 ; r3 -> above + 1
148 pxor m0, m0
149 movh m1, [r2]
150 movh m2, [r3]
151 punpcklqdq m1, m2 ; low qword = left bytes, high qword = above bytes
152 psadbw m1, m0 ; one partial sum per qword
153 pshufd m2, m1, 2 ; bring the high-qword sum down
154 paddw m1, m2 ; low word = total of the 16 bytes
155
156 movd r5d, m1
157 add r5d, 8 ; rounding term
158 shr r5d, 4 ; sum = sum / 16
159 movd m1, r5d
160 pshufb m1, m0 ; m1 = byte [dc_val ...]
161
162 test r4d, r4d ; ZF = (filter == 0); consumed by "jz .end" below (mov/lea/SSE stores leave EFLAGS alone)
163
164 ; store DC 8x8
165 mov r6, r0 ; keep the block origin for the filter pass
166 movh [r0], m1
167 movh [r0 + r1], m1
168 lea r0, [r0 + r1 * 2]
169 movh [r0], m1
170 movh [r0 + r1], m1
171 lea r0, [r0 + r1 * 2]
172 movh [r0], m1
173 movh [r0 + r1], m1
174 lea r0, [r0 + r1 * 2]
175 movh [r0], m1
176 movh [r0 + r1], m1
177
178 ; Do DC Filter
179 jz .end ; filter == 0: done
180 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
181 add r5d, r4d ; r5d = DC * 3 + 2
182 movd m1, r5d
183 pshuflw m1, m1, 0 ; m1 = pixDCx3
184 pshufd m1, m1, 0 ; replicate to all 8 words
185
186 ; filter top
187 pmovzxbw m2, [r3]
188 paddw m2, m1
189 psraw m2, 2 ; (above[i] + 3 * DC + 2) >> 2
190 packuswb m2, m2
191 movh [r6], m2 ; column 0 is fixed up just below
192
193 ; filter top-left
194 movzx r3d, byte [r3] ; first above pixel
195 add r4d, r3d
196 movzx r3d, byte [r2] ; first left pixel
197 add r3d, r4d
198 shr r3d, 2 ; (left + above + 2 * DC + 2) >> 2
199 mov [r6], r3b
200
201 ; filter left
202 add r6, r1 ; r6 -> row 1
203 pmovzxbw m2, [r2 + 1] ; left pixels starting with the one beside row 1
204 paddw m2, m1
205 psraw m2, 2
206 packuswb m2, m2
207 pextrb [r6], m2, 0 ; row 1
208 pextrb [r6 + r1], m2, 1 ; row 2
209 pextrb [r6 + 2 * r1], m2, 2 ; row 3
210 lea r6, [r6 + r1 * 2] ; r6 -> row 3
211 pextrb [r6 + r1], m2, 3 ; row 4
212 pextrb [r6 + r1 * 2], m2, 4 ; row 5
213 pextrb [r6 + r1 * 4], m2, 6 ; row 7
214 lea r1, [r1 * 3]
215 pextrb [r6 + r1], m2, 5 ; row 6 (r1 now holds 3 * stride)
216
217.end:
218 RET
219
220;-------------------------------------------------------------------------------------------
221; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 16x16 DC prediction. DC = (sum of 16 bytes at left + 1 and 16 bytes at
; above + 1 + 16) >> 5 is written to the whole block. When filter is non-zero,
; row 0 and column 0 are rewritten as (ref + 3 * DC + 2) >> 2 and the top-left
; pixel as (left + above + 2 * DC + 2) >> 2.
; r0 = dst (advanced while storing), r1 = dstStride, r2 = left, r3 = above,
; r6 = saved dst for the filter pass; r4/r5 are scratch. dirMode is not read.
222;-------------------------------------------------------------------------------------------
223INIT_XMM sse4
; Only the first four arguments are declared as loaded: dirMode is never used
; and filter is fetched explicitly through r5m, matching intra_pred_dc4/dc8.
224cglobal intra_pred_dc16, 4, 7, 4
225 mov r4d, r5m ; r4d = filter argument
226 inc r2 ; r2 -> left + 1
227 inc r3 ; r3 -> above + 1
228 pxor m0, m0
229 movu m1, [r2]
230 movu m2, [r3]
231 psadbw m1, m0 ; two partial sums (one per qword) of the left bytes
232 psadbw m2, m0 ; same for the above bytes
233 paddw m1, m2
234 pshufd m2, m1, 2 ; bring the high-qword sum down
235 paddw m1, m2 ; low word = total of the 32 bytes
236
237 movd r5d, m1
238 add r5d, 16 ; rounding term
239 shr r5d, 5 ; sum = sum / 32
240 movd m1, r5d
241 pshufb m1, m0 ; m1 = byte [dc_val ...]
242
243 test r4d, r4d ; ZF = (filter == 0); consumed by "jz .end" below (mov/lea/SSE stores leave EFLAGS alone)
244
245 ; store DC 16x16
246 mov r6, r0 ; keep the block origin for the filter pass
247 movu [r0], m1
248 movu [r0 + r1], m1
249 lea r0, [r0 + r1 * 2]
250 movu [r0], m1
251 movu [r0 + r1], m1
252 lea r0, [r0 + r1 * 2]
253 movu [r0], m1
254 movu [r0 + r1], m1
255 lea r0, [r0 + r1 * 2]
256 movu [r0], m1
257 movu [r0 + r1], m1
258 lea r0, [r0 + r1 * 2]
259 movu [r0], m1
260 movu [r0 + r1], m1
261 lea r0, [r0 + r1 * 2]
262 movu [r0], m1
263 movu [r0 + r1], m1
264 lea r0, [r0 + r1 * 2]
265 movu [r0], m1
266 movu [r0 + r1], m1
267 lea r0, [r0 + r1 * 2]
268 movu [r0], m1
269 movu [r0 + r1], m1
270
271 ; Do DC Filter
272 jz .end ; filter == 0: done
273 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
274 add r5d, r4d ; r5d = DC * 3 + 2
275 movd m1, r5d
276 pshuflw m1, m1, 0 ; m1 = pixDCx3
277 pshufd m1, m1, 0 ; replicate to all 8 words
278
279 ; filter top
280 pmovzxbw m2, [r3] ; columns 0..7
281 paddw m2, m1
282 psraw m2, 2
283 packuswb m2, m2
284 movh [r6], m2
285 pmovzxbw m3, [r3 + 8] ; columns 8..15
286 paddw m3, m1
287 psraw m3, 2
288 packuswb m3, m3
289 movh [r6 + 8], m3
290
291 ; filter top-left
292 movzx r3d, byte [r3] ; first above pixel
293 add r4d, r3d
294 movzx r3d, byte [r2] ; first left pixel
295 add r3d, r4d
296 shr r3d, 2 ; (left + above + 2 * DC + 2) >> 2
297 mov [r6], r3b
298
299 ; filter left
300 add r6, r1 ; r6 -> row 1
301 pmovzxbw m2, [r2 + 1] ; left pixels beside rows 1..8
302 paddw m2, m1
303 psraw m2, 2
304 packuswb m2, m2
305 pextrb [r6], m2, 0 ; row 1
306 pextrb [r6 + r1], m2, 1 ; row 2
307 pextrb [r6 + r1 * 2], m2, 2 ; row 3
308 lea r6, [r6 + r1 * 2] ; r6 -> row 3
309 pextrb [r6 + r1], m2, 3 ; row 4
310 pextrb [r6 + r1 * 2], m2, 4 ; row 5
311 lea r6, [r6 + r1 * 2] ; r6 -> row 5
312 pextrb [r6 + r1], m2, 5 ; row 6
313 pextrb [r6 + r1 * 2], m2, 6 ; row 7
314 lea r6, [r6 + r1 * 2] ; r6 -> row 7
315 pextrb [r6 + r1], m2, 7 ; row 8
316
317 pmovzxbw m3, [r2 + 9] ; left pixels beside rows 9..15 (8 bytes are read, 7 are used)
318 paddw m3, m1
319 psraw m3, 2
320 packuswb m3, m3
321 pextrb [r6 + r1 * 2], m3, 0 ; row 9
322 lea r6, [r6 + r1 * 2] ; r6 -> row 9
323 pextrb [r6 + r1], m3, 1 ; row 10
324 pextrb [r6 + r1 * 2], m3, 2 ; row 11
325 lea r6, [r6 + r1 * 2] ; r6 -> row 11
326 pextrb [r6 + r1], m3, 3 ; row 12
327 pextrb [r6 + r1 * 2], m3, 4 ; row 13
328 lea r6, [r6 + r1 * 2] ; r6 -> row 13
329 pextrb [r6 + r1], m3, 5 ; row 14
330 pextrb [r6 + r1 * 2], m3, 6 ; row 15
331
332.end:
333 RET
334
335;-------------------------------------------------------------------------------------------
336; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 32x32 DC prediction. DC = (sum of 32 bytes at left + 1 and 32 bytes at
; above + 1 + 32) >> 6 is written to the whole block. Neither dirMode nor
; filter is read here, so no edge filtering is performed for this size.
; r0 = dst (advanced while storing), r1 = dstStride, r2 = left, r3 = above, r4 = scratch.
337;-------------------------------------------------------------------------------------------
338INIT_XMM sse4
339cglobal intra_pred_dc32, 4, 5, 5
340 inc r2 ; r2 -> left + 1
341 inc r3 ; r3 -> above + 1
342 pxor m0, m0
343 movu m1, [r2]
344 movu m2, [r2 + 16]
345 movu m3, [r3]
346 movu m4, [r3 + 16]
347 psadbw m1, m0 ; per-qword byte sums
348 psadbw m2, m0
349 psadbw m3, m0
350 psadbw m4, m0
351 paddw m1, m2
352 paddw m3, m4
353 paddw m1, m3
354 pshufd m2, m1, 2 ; bring the high-qword sum down
355 paddw m1, m2 ; low word = total of the 64 bytes
356
357 movd r4d, m1
358 add r4d, 32 ; rounding term
359 shr r4d, 6 ; sum = sum / 64
360 movd m1, r4d
361 pshufb m1, m0 ; m1 = byte [dc_val ...]
362
363%rep 2
364 ; store DC: each pass of this %rep writes 16 rows of 32 bytes (two 16-byte stores per row)
365 movu [r0], m1
366 movu [r0 + r1], m1
367 movu [r0 + 16], m1
368 movu [r0 + r1 + 16],m1
369 lea r0, [r0 + 2 * r1]
370 movu [r0], m1
371 movu [r0 + r1], m1
372 movu [r0 + 16], m1
373 movu [r0 + r1 + 16],m1
374 lea r0, [r0 + 2 * r1]
375 movu [r0], m1
376 movu [r0 + r1], m1
377 movu [r0 + 16], m1
378 movu [r0 + r1 + 16],m1
379 lea r0, [r0 + 2 * r1]
380 movu [r0], m1
381 movu [r0 + r1], m1
382 movu [r0 + 16], m1
383 movu [r0 + r1 + 16],m1
384 lea r0, [r0 + 2 * r1]
385 movu [r0], m1
386 movu [r0 + r1], m1
387 movu [r0 + 16], m1
388 movu [r0 + r1 + 16],m1
389 lea r0, [r0 + 2 * r1]
390 movu [r0], m1
391 movu [r0 + r1], m1
392 movu [r0 + 16], m1
393 movu [r0 + r1 + 16],m1
394 lea r0, [r0 + 2 * r1]
395 movu [r0], m1
396 movu [r0 + r1], m1
397 movu [r0 + 16], m1
398 movu [r0 + r1 + 16],m1
399 lea r0, [r0 + 2 * r1]
400 movu [r0], m1
401 movu [r0 + r1], m1
402 movu [r0 + 16], m1
403 movu [r0 + r1 + 16],m1
404 lea r0, [r0 + 2 * r1]
405%endrep
406
407 RET
408
409;-----------------------------------------------------------------------------------------------------------
410; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 4x4 planar prediction, two rows per macro invocation. dirMode and filter are
; not read. Indices in the comments below are relative to left + 1 / above + 1.
; r0 = dst (advanced by the macro), r1 = dstStride, r2 = left + 1, r3 = above + 1,
; r6d = topRight, r4/r5 = scratch.
; Running state across the macro calls:
;   m0 = vertical term, low qword for the first row of the pair, high qword for the second
;   m2 = 2 * bottomRow, added to m0 once per row pair
411;-----------------------------------------------------------------------------------------------------------
412INIT_XMM sse4
413cglobal intra_pred_planar4, 4,7,5
414 inc r2
415 inc r3
416 pmovzxbw m0, [r3] ; topRow[i] = above[i];
417 punpcklqdq m0, m0 ; same four topRow words in both halves
418
419 pxor m1, m1
420 movd m2, [r2 + 4] ; bottomLeft = left[4]
421 movzx r6d, byte [r3 + 4] ; topRight = above[4];
422 pshufb m2, m1 ; broadcast the bottomLeft byte
423 punpcklbw m2, m1 ; ... and widen it to words
424 psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i]
425 psllw m0, 2 ; topRow * 4
426 punpcklqdq m3, m2, m1 ; m3 = bottomRow in the low half, zero in the high half
427 psubw m0, m3 ; low half starts one bottomRow step behind the high half
428 paddw m2, m2 ; per-pair increment = 2 * bottomRow
429
; COMP_PRED_PLANAR_2ROW n: emit rows n and n + 1, then advance r0 by two rows.
; Clobbers r4, r5, m1, m3, m4 and updates m0. multi_2Row is an external table,
; presumably the per-column weights for both rows. The macro is not %undef'd.
430%macro COMP_PRED_PLANAR_2ROW 1
431 movzx r4d, byte [r2 + %1]
432 lea r4d, [r4d * 4 + 4] ; left[n] * 4 + rounding term
433 movd m3, r4d
434 pshuflw m3, m3, 0
435
436 movzx r4d, byte [r2 + %1 + 1]
437 lea r4d, [r4d * 4 + 4] ; left[n + 1] * 4 + rounding term
438 movd m4, r4d
439 pshuflw m4, m4, 0
440 punpcklqdq m3, m4 ; horPred
441
442 movzx r4d, byte [r2 + %1]
443 mov r5d, r6d
444 sub r5d, r4d ; topRight - left[n]
445 movd m4, r5d
446 pshuflw m4, m4, 0
447
448 movzx r4d, byte [r2 + %1 + 1]
449 mov r5d, r6d
450 sub r5d, r4d ; topRight - left[n + 1]
451 movd m1, r5d
452 pshuflw m1, m1, 0
453 punpcklqdq m4, m1 ; rightColumnN
454
455 pmullw m4, [multi_2Row]
456 paddw m3, m4 ; horizontal term
457 paddw m0, m2 ; step the vertical term to this row pair
458 paddw m3, m0
459 psraw m3, 3
460 packuswb m3, m3 ; bytes 0..3 = row n, bytes 4..7 = row n + 1
461
462 movd [r0], m3
463 pshufd m3, m3, 0x55 ; move dword 1 (row n + 1) into the low dword
464 movd [r0 + r1], m3
465 lea r0, [r0 + 2 * r1]
466%endmacro
467
468 COMP_PRED_PLANAR_2ROW 0
469 COMP_PRED_PLANAR_2ROW 2
470
471 RET
472
473;-----------------------------------------------------------------------------------------------------------
474; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 8x8 planar prediction, one row per macro invocation. dirMode and filter are
; not read. Indices in the comments below are relative to left + 1 / above + 1.
; r0 = dst (advanced by the macro), r1 = dstStride, r2 = left + 1, r3 = above + 1.
; State held across rows:
;   m1 = topRow * 8, accumulating one v_bottomRow (m4) per row
;   m3 = v_rightColumn = topRight - left[k], one word per row k
;   m6 = left[k] * 8 + 8, one word per row k (pw_8 is external, presumably words of 8)
475;-----------------------------------------------------------------------------------------------------------
476INIT_XMM sse4
477cglobal intra_pred_planar8, 4,4,7
478 inc r2
479 inc r3
480 pxor m0, m0
481 pmovzxbw m1, [r3] ; v_topRow
482 pmovzxbw m2, [r2] ; v_leftColumn
483
484 movd m3, [r3 + 8] ; topRight = above[8];
485 movd m4, [r2 + 8] ; bottomLeft = left[8];
486
487 pshufb m3, m0 ; broadcast byte 0
488 pshufb m4, m0
489 punpcklbw m3, m0 ; v_topRight
490 punpcklbw m4, m0 ; v_bottomLeft
491
492 psubw m4, m1 ; v_bottomRow
493 psubw m3, m2 ; v_rightColumn
494
495 psllw m1, 3 ; v_topRow
496 psllw m2, 3 ; v_leftColumn
497
498 paddw m6, m2, [pw_8] ; left * 8 + rounding term
499
; PRED_PLANAR_ROW8 k: emit row k and advance r0 by one row.
; Clobbers m2, m5 and updates m1. multiL is an external table, presumably the
; per-column weights. The macro is not %undef'd.
500%macro PRED_PLANAR_ROW8 1
501 %if (%1 < 4)
502 pshuflw m5, m6, 0x55 * %1 ; select word k within the low qword
503 pshufd m5, m5, 0 ; ... and replicate it to all lanes
504 pshuflw m2, m3, 0x55 * %1
505 pshufd m2, m2, 0
506 %else
507 pshufhw m5, m6, 0x55 * (%1 - 4) ; select word k within the high qword
508 pshufd m5, m5, 0xAA ; ... and replicate it to all lanes
509 pshufhw m2, m3, 0x55 * (%1 - 4)
510 pshufd m2, m2, 0xAA
511 %endif
512
513 pmullw m2, [multiL] ; rightColumn[k] scaled per column
514 paddw m5, m2 ; horizontal term
515 paddw m1, m4 ; step the vertical term by one bottomRow
516 paddw m5, m1
517 psraw m5, 4
518 packuswb m5, m5
519
520 movh [r0], m5
521 lea r0, [r0 + r1]
522
523%endmacro
524
525 PRED_PLANAR_ROW8 0
526 PRED_PLANAR_ROW8 1
527 PRED_PLANAR_ROW8 2
528 PRED_PLANAR_ROW8 3
529 PRED_PLANAR_ROW8 4
530 PRED_PLANAR_ROW8 5
531 PRED_PLANAR_ROW8 6
532 PRED_PLANAR_ROW8 7
533
534 RET
535
536
537;-----------------------------------------------------------------------------------------------------------
538; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 16x16 planar prediction, one row per macro invocation. dirMode and filter are
; not read. Indices in the comments below are relative to left + 1 / above + 1.
; r0 = dst (advanced by the macro), r1 = dstStride, r2 = left + 1,
; r4d = topRight; r3 is reused as scratch once the top row has been loaded.
; State held across rows:
;   m1 / m2 = topRow * 16 for columns 0-7 / 8-15, accumulating m4 / m5 per row
;   m4 / m5 = v_bottomRow for columns 0-7 / 8-15
539;-----------------------------------------------------------------------------------------------------------
540INIT_XMM sse4
541cglobal intra_pred_planar16, 4,6,8
542 inc r2
543 inc r3
544 pxor m0, m0
545 pmovzxbw m1, [r3] ; topRow[0-7]
546 pmovzxbw m2, [r3 + 8] ; topRow[8-15]
547
548 movd m3, [r2 + 16]
549 pshufb m3, m0
550 punpcklbw m3, m0 ; v_bottomLeft = left[16]
551 movzx r4d, byte [r3 + 16] ; topRight = above[16]
552
553 psubw m4, m3, m1 ; v_bottomRow[0]
554 psubw m5, m3, m2 ; v_bottomRow[1]
555
556 psllw m1, 4
557 psllw m2, 4
558
; PRED_PLANAR_ROW16 k: emit row k and advance r0 by one row.
; Clobbers r3, r5, m3, m6, m7 and updates m1, m2. multiL / multiH are external
; tables, presumably the per-column weights for columns 0-7 / 8-15.
; The macro is not %undef'd.
559%macro PRED_PLANAR_ROW16 1
560 movzx r5d, byte [r2 + %1]
561 add r5d, r5d
562 lea r5d, [r5d * 8 + 16] ; left[k] * 16 + rounding term
563 movd m3, r5d
564 pshuflw m3, m3, 0
565 pshufd m3, m3, 0 ; horPred
566
567 movzx r5d, byte [r2 + %1]
568 mov r3d, r4d
569 sub r3d, r5d ; topRight - left[k]
570 movd m6, r3d
571 pshuflw m6, m6, 0
572 pshufd m6, m6, 0
573
574 pmullw m7, m6, [multiL] ; columns 0-7
575 paddw m7, m3
576 paddw m1, m4 ; step the vertical term, columns 0-7
577 paddw m7, m1
578 psraw m7, 5
579
580 pmullw m6, m6, [multiH] ; columns 8-15
581 paddw m3, m6
582 paddw m2, m5 ; step the vertical term, columns 8-15
583 paddw m3, m2
584 psraw m3, 5
585
586 packuswb m7, m3 ; 16 output bytes
587 movu [r0], m7
588 lea r0, [r0 + r1]
589%endmacro
590
591 PRED_PLANAR_ROW16 0
592 PRED_PLANAR_ROW16 1
593 PRED_PLANAR_ROW16 2
594 PRED_PLANAR_ROW16 3
595 PRED_PLANAR_ROW16 4
596 PRED_PLANAR_ROW16 5
597 PRED_PLANAR_ROW16 6
598 PRED_PLANAR_ROW16 7
599 PRED_PLANAR_ROW16 8
600 PRED_PLANAR_ROW16 9
601 PRED_PLANAR_ROW16 10
602 PRED_PLANAR_ROW16 11
603 PRED_PLANAR_ROW16 12
604 PRED_PLANAR_ROW16 13
605 PRED_PLANAR_ROW16 14
606 PRED_PLANAR_ROW16 15
607
608 RET
609
610
611;-----------------------------------------------------------------------------------------------------------
612; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 32x32 planar prediction, one row per loop iteration (two 16-pixel halves).
; dirMode and filter are not read. Indices in the comments below are relative
; to left + 1 / above + 1.
; r0 = dst (advanced per row), r1 = dstStride, r2 = current left pixel,
; r3 = row counter inside the loop, r4d = topRight, r5/r6 = scratch.
; The four bottomRow vectors live in m8-m11 on x86-64 and in stack slots
; otherwise (the 0-(4*mmsize) cglobal argument presumably reserves that stack).
; m1..m4 = topRow * 32 for the four 8-column groups, accumulated per row.
613;-----------------------------------------------------------------------------------------------------------
614INIT_XMM sse4
615%if ARCH_X86_64 == 1
616cglobal intra_pred_planar32, 4,7,12
617 %define bottomRow0 m8
618 %define bottomRow1 m9
619 %define bottomRow2 m10
620 %define bottomRow3 m11
621%else
622cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize)
623 %define bottomRow0 [rsp + 0 * mmsize]
624 %define bottomRow1 [rsp + 1 * mmsize]
625 %define bottomRow2 [rsp + 2 * mmsize]
626 %define bottomRow3 [rsp + 3 * mmsize]
627%endif
628 inc r2
629 inc r3
630 pxor m3, m3 ; temporary zero; m3 is reloaded with topRow[2] below
631 movd m0, [r2 + 32]
632 pshufb m0, m3
633 punpcklbw m0, m3 ; v_bottomLeft = left[32]
634 movzx r4d, byte [r3 + 32] ; topRight = above[32]
635
636 pmovzxbw m1, [r3 + 0] ; topRow[0]
637 pmovzxbw m2, [r3 + 8] ; topRow[1]
638 pmovzxbw m3, [r3 +16] ; topRow[2]
639 pmovzxbw m4, [r3 +24] ; topRow[3]
640
641 psubw m5, m0, m1 ; v_bottomRow[0]
642 psubw m6, m0, m2 ; v_bottomRow[1]
643 psubw m7, m0, m3 ; v_bottomRow[2]
644 psubw m0, m4 ; v_bottomRow[3]
645
646 mova bottomRow0, m5
647 mova bottomRow1, m6
648 mova bottomRow2, m7
649 mova bottomRow3, m0
650
651 psllw m1, 5
652 psllw m2, 5
653 psllw m3, 5
654 psllw m4, 5
655
; COMP_PRED_PLANAR_ROW off: emit 16 pixels of the current row at byte offset
; off (0 or 16). off == 0 uses multiL/multiH with m1/m2, off == 16 uses
; multiH2/multiH3 with m3/m4 (all four tables are external, presumably the
; per-column weights). Clobbers r5, r6, m5, m6, m7.
656%macro COMP_PRED_PLANAR_ROW 1
657 movzx r5d, byte [r2]
658 shl r5d, 5
659 add r5d, 32 ; left[k] * 32 + rounding term
660 movd m5, r5d
661 pshuflw m5, m5, 0
662 pshufd m5, m5, 0 ; horPred
663
664 movzx r5d, byte [r2]
665 mov r6d, r4d
666 sub r6d, r5d ; topRight - left[k]
667 movd m6, r6d
668 pshuflw m6, m6, 0
669 pshufd m6, m6, 0
670
671%if (%1 == 0)
672 pmullw m7, m6, [multiL]
673%else
674 pmullw m7, m6, [multiH2]
675%endif
676
677 paddw m7, m5
678%if (%1 == 0)
679 paddw m1, bottomRow0 ; step the vertical term, columns 0-7
680 paddw m7, m1
681%else
682 paddw m3, bottomRow2 ; step the vertical term, columns 16-23
683 paddw m7, m3
684%endif
685 psraw m7, 6
686
687%if (%1 == 0)
688 pmullw m6, [multiH]
689%else
690 pmullw m6, [multiH3]
691%endif
692 paddw m6, m5
693%if (%1 == 0)
694 paddw m2, bottomRow1 ; step the vertical term, columns 8-15
695 paddw m6, m2
696%else
697 paddw m4, bottomRow3 ; step the vertical term, columns 24-31
698 paddw m6, m4
699%endif
700 psraw m6, 6
701
702 packuswb m7, m6
703 movu [r0 + %1], m7
704%endmacro
705
706 mov r3, 32 ; row counter (the above pointer is no longer needed)
707.loop:
708 COMP_PRED_PLANAR_ROW 0
709 COMP_PRED_PLANAR_ROW 16
710 inc r2 ; next left pixel
711 lea r0, [r0 + r1] ; next output row
712
713 dec r3
714 jnz .loop
715%undef COMP_PRED_PLANAR_ROW
716
717 RET
718
719;-----------------------------------------------------------------------------
720; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
; 4x4 angular prediction, modes 2 and 34. The reference row is refAbove when
; dirMode == 34 and refLeft otherwise; output row k is the 4 reference bytes
; starting at ref + 2 + k. bFilter is not read.
721;-----------------------------------------------------------------------------
722INIT_XMM ssse3
723cglobal intra_pred_ang4_2, 3,3,4
724 cmp r4m, byte 34
725 cmove r2, r3mp ; mode 34: switch to refAbove
726 movh m0, [r2 + 2] ; 8 reference bytes starting at ref + 2
727 movd [r0], m0 ; row 0
728 palignr m1, m0, 1 ; low bytes = m0 shifted down by 1 (upper garbage is never stored)
729 movd [r0 + r1], m1 ; row 1
730 palignr m2, m0, 2
731 movd [r0 + r1 * 2], m2 ; row 2
732 lea r1, [r1 * 3]
733 psrldq m0, 3
734 movd [r0 + r1], m0 ; row 3
735 RET
736
737
; 4x4 angular prediction, modes 3 and 33, plus the filter/store tail shared by
; the other 4x4 angular routines below (they jump to .do_filter4x4).
; The reference is refAbove when dirMode == 33, refLeft otherwise.
; A bracketed number [n] in the comments is ang_table entry n, i.e. the weight
; pair (32 - n, n) applied by pmaddubsw to two neighbouring reference bytes.
738INIT_XMM sse4
739cglobal intra_pred_ang4_3, 3,4,5
740 cmp r4m, byte 33 ; ZF is kept alive until "jz .store" in the shared tail
741 cmove r2, r3mp
742 lea r3, [ang_table + 20 * 16]
743 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
744 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
745 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
746 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
747 palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
748 palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
749 punpcklqdq m0, m1
750 punpcklqdq m2, m3
751
752 movh m3, [r3 + 6 * 16] ; [26]
753 movhps m3, [r3] ; [20]
754 movh m4, [r3 - 6 * 16] ; [14]
755 movhps m4, [r3 - 12 * 16] ; [ 8]
756 jmp .do_filter4x4
757
758 ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
; i.e. m0 holds the byte pairs for lines 0 (low qword) and 1 (high qword),
; m2 those for lines 2 and 3, m3/m4 the matching weights, r0 = dst,
; r1 = dstStride, and ZF set means "store the lines as rows without transposing".
759ALIGN 16
760.do_filter4x4:
761 mova m1, [pw_1024] ; external constant; presumably words of 1024, making pmulhrsw compute (x + 16) >> 5
762
763 pmaddubsw m0, m3
764 pmulhrsw m0, m1
765 pmaddubsw m2, m4
766 pmulhrsw m2, m1
767 packuswb m0, m2 ; 16 bytes = four lines of four pixels
768
769 ; NOTE: mode 33 doesn't reorder. UNSAFE: this relies on no instruction since the entry "cmp" having modified EFLAGS
770 jz .store
771
772 ; transpose 4x4
773 pshufb m0, [c_trans_4x4]
774
775.store:
776 ; NOTE: rows 1..3 are written with pextrd (the original TODO asked for pextrd once the intrinsic SSSE3 version was removed)
777 movd [r0], m0
778 pextrd [r0 + r1], m0, 1
779 pextrd [r0 + r1 * 2], m0, 2
780 lea r1, [r1 * 3]
781 pextrd [r0 + r1], m0, 3
782 RET
783
784
; 4x4 angular prediction, modes 4 and 32 (refAbove and no transpose when
; dirMode == 32). Builds the byte pairs and weights, then jumps to the shared
; tail .do_filter4x4 in intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
785cglobal intra_pred_ang4_4, 3,4,5
786 cmp r4m, byte 32
787 cmove r2, r3mp
788 lea r3, [ang_table + 18 * 16]
789 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
790 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
791 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
792 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
793 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
794 punpcklqdq m0, m1 ; lines 0 and 1
795 punpcklqdq m2, m1, m3 ; lines 2 and 3
796
797 movh m3, [r3 + 3 * 16] ; [21]
798 movhps m3, [r3 - 8 * 16] ; [10]
799 movh m4, [r3 + 13 * 16] ; [31]
800 movhps m4, [r3 + 2 * 16] ; [20]
801 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
802
803
; 4x4 angular prediction, modes 5 and 31 (refAbove and no transpose when
; dirMode == 31). Builds the byte pairs and weights, then jumps to the shared
; tail .do_filter4x4 in intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
804cglobal intra_pred_ang4_5, 3,4,5
805 cmp r4m, byte 31
806 cmove r2, r3mp
807 lea r3, [ang_table + 10 * 16]
808 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
809 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
810 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
811 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
812 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
813 punpcklqdq m0, m1 ; lines 0 and 1
814 punpcklqdq m2, m1, m3 ; lines 2 and 3
815
816 movh m3, [r3 + 7 * 16] ; [17]
817 movhps m3, [r3 - 8 * 16] ; [ 2]
818 movh m4, [r3 + 9 * 16] ; [19]
819 movhps m4, [r3 - 6 * 16] ; [ 4]
820 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
821
822
; 4x4 angular prediction, modes 6 and 30 (refAbove and no transpose when
; dirMode == 30). Builds the byte pairs and weights, then jumps to the shared
; tail .do_filter4x4 in intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
823cglobal intra_pred_ang4_6, 3,4,5
824 cmp r4m, byte 30
825 cmove r2, r3mp
826 lea r3, [ang_table + 19 * 16]
827 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
828 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
829 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
830 palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
831 punpcklqdq m0, m0 ; lines 0 and 1 use the same byte pairs
832 punpcklqdq m2, m2 ; lines 2 and 3 use the pairs one byte further on
833
834 movh m3, [r3 - 6 * 16] ; [13]
835 movhps m3, [r3 + 7 * 16] ; [26]
836 movh m4, [r3 - 12 * 16] ; [ 7]
837 movhps m4, [r3 + 1 * 16] ; [20]
838 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
839
840
; 4x4 angular prediction, modes 7 and 29 (refAbove and no transpose when
; dirMode == 29). Builds the byte pairs and weights, then jumps to the shared
; tail .do_filter4x4 in intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
841cglobal intra_pred_ang4_7, 3,4,5
842 cmp r4m, byte 29
843 cmove r2, r3mp
844 lea r3, [ang_table + 20 * 16]
845 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
846 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
847 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
848 palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
849 punpcklqdq m2, m0, m3 ; line 2 uses the unshifted pairs, line 3 the pairs one byte further on
850 punpcklqdq m0, m0 ; lines 0 and 1 use the same byte pairs
851
852 movh m3, [r3 - 11 * 16] ; [ 9]
853 movhps m3, [r3 - 2 * 16] ; [18]
854 movh m4, [r3 + 7 * 16] ; [27]
855 movhps m4, [r3 - 16 * 16] ; [ 4]
856 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
857
858
; 4x4 angular prediction, modes 8 and 28 (refAbove and no transpose when
; dirMode == 28). All four lines use the same byte pairs and differ only in
; their weights. Ends by jumping to the shared tail .do_filter4x4 in
; intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
859cglobal intra_pred_ang4_8, 3,4,5
860 cmp r4m, byte 28
861 cmove r2, r3mp
862 lea r3, [ang_table + 13 * 16]
863 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
864 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
865 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
866 punpcklqdq m0, m0
867 mova m2, m0
868
869 movh m3, [r3 - 8 * 16] ; [ 5]
870 movhps m3, [r3 - 3 * 16] ; [10]
871 movh m4, [r3 + 2 * 16] ; [15]
872 movhps m4, [r3 + 7 * 16] ; [20]
873 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
874
875
; 4x4 angular prediction, modes 9 and 27 (refAbove and no transpose when
; dirMode == 27). All four lines use the same byte pairs and differ only in
; their weights. Ends by jumping to the shared tail .do_filter4x4 in
; intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
876cglobal intra_pred_ang4_9, 3,4,5
877 cmp r4m, byte 27
878 cmove r2, r3mp
879 lea r3, [ang_table + 4 * 16]
880 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
881 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
882 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
883 punpcklqdq m0, m0
884 mova m2, m0
885
886 movh m3, [r3 - 2 * 16] ; [ 2]
887 movhps m3, [r3 - 0 * 16] ; [ 4]
888 movh m4, [r3 + 2 * 16] ; [ 6]
889 movhps m4, [r3 + 4 * 16] ; [ 8]
890 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
891
892
; 4x4 angular prediction, mode 10. Each output row is one dword of the vector
; produced by shuffling the 4 bytes at refLeft + 1 with pb_unpackbd1 (external
; mask; presumably it replicates each source byte four times, giving a
; horizontal fill). When bFilter is non-zero, row 0 additionally gets
; (refAbove[1 + i] - refAbove[0]) >> 1 added per column, with unsigned
; saturation to a byte. dirMode is not read.
893cglobal intra_pred_ang4_10, 3,3,4
894 movd m0, [r2 + 1] ; [4 3 2 1] (movd loads 4 bytes)
895 pshufb m0, [pb_unpackbd1]
896
897 pshufd m1, m0, 1 ; dword 1 -> row 1
898 movhlps m2, m0 ; dword 2 -> row 2
899 pshufd m3, m0, 3 ; dword 3 -> row 3
900 movd [r0 + r1], m1
901 movd [r0 + r1 * 2], m2
902 lea r1, [r1 * 3]
903 movd [r0 + r1], m3
904
905 cmp r5m, byte 0
906 jz .quit ; bFilter == 0: row 0 is stored unmodified
907
908 ; filter
909 mov r2, r3mp ; r2 = refAbove
910 pmovzxbw m0, m0 ; low 4 words = the row-0 bytes widened
911 movh m1, [r2] ; [x x x 4 3 2 1 0] (8 bytes loaded, 0..4 used)
912 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
913 pshufb m1, [pb_unpackbw1] ; [4 3 2 1]
914 psubw m1, m2 ; above[i] - above[0]
915 psraw m1, 1
916 paddw m0, m1
917 packuswb m0, m0
918
919.quit:
920 movd [r0], m0 ; row 0
921 RET
922
923
; 4x4 angular prediction, mode 26. Every row is a copy of the 4 bytes at
; refAbove + 1. When bFilter is non-zero, column 0 of row k is rewritten as
; refAbove[1] + ((refLeft[1 + k] - refLeft[0]) >> 1), with unsigned saturation
; to a byte. dirMode is not read.
924INIT_XMM sse4
925cglobal intra_pred_ang4_26, 4,4,3
926 movd m0, [r3 + 1] ; [4 3 2 1] (movd loads 4 bytes)
927
928 ; store
929 movd [r0], m0
930 movd [r0 + r1], m0
931 movd [r0 + r1 * 2], m0
932 lea r3, [r1 * 3] ; r3 is reused as 3 * stride
933 movd [r0 + r3], m0
934
935 ; filter
936 cmp r5m, byte 0
937 jz .quit
938
939 pshufb m0, [pb_0_8] ; [ 1 1 1 1]
940 movh m1, [r2] ; [-4 -3 -2 -1 0]
941 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
942 pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
943 psubw m1, m2 ; left[k] - left[0]
944 psraw m1, 1
945 paddw m0, m1
946 packuswb m0, m0
947
948 pextrb [r0], m0, 0 ; column 0 of row 0
949 pextrb [r0 + r1], m0, 1 ; column 0 of row 1
950 pextrb [r0 + r1 * 2], m0, 2 ; column 0 of row 2
951 pextrb [r0 + r3], m0, 3 ; column 0 of row 3
952
953.quit:
954 RET
955
956
; 4x4 angular prediction, modes 11 and 25 (refAbove and no transpose when
; dirMode == 25). All four lines use the same byte pairs, starting at ref[0],
; and differ only in their weights. Ends by jumping to the shared tail
; .do_filter4x4 in intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n); r3 points at entry 24.
957cglobal intra_pred_ang4_11, 3,4,5
958 cmp r4m, byte 25
959 cmove r2, r3mp
960 lea r3, [ang_table + 24 * 16]
961 movh m0, [r2] ; [x x x 4 3 2 1 0]
962 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
963 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
964 punpcklqdq m0, m0
965 mova m2, m0
966
967 movh m3, [r3 + 6 * 16] ; [30]
968 movhps m3, [r3 + 4 * 16] ; [28]
969 movh m4, [r3 + 2 * 16] ; [26]
970 movhps m4, [r3 + 0 * 16] ; [24]
971 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
972
973
; 4x4 angular prediction, modes 12 and 24 (refAbove and no transpose when
; dirMode == 24). All four lines use the same byte pairs, starting at ref[0],
; and differ only in their weights. Ends by jumping to the shared tail
; .do_filter4x4 in intra_pred_ang4_3 with ZF from the cmp still intact.
; [n] = ang_table entry n, weight pair (32 - n, n).
974cglobal intra_pred_ang4_12, 3,4,5
975 cmp r4m, byte 24
976 cmove r2, r3mp
977 lea r3, [ang_table + 20 * 16]
978 movh m0, [r2] ; [x x x 4 3 2 1 0]
979 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
980 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
981 punpcklqdq m0, m0
982 mova m2, m0
983
984 movh m3, [r3 + 7 * 16] ; [27]
985 movhps m3, [r3 + 2 * 16] ; [22]
986 movh m4, [r3 - 3 * 16] ; [17]
987 movhps m4, [r3 - 8 * 16] ; [12]
988 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
989
990
; 4x4 angular prediction, modes 13 and 23. For dirMode == 23 the two reference
; pointers are swapped, so below r2 is always the main reference and r3 the
; side reference; one byte of the side reference (offset 4) is inserted ahead
; of the main reference. Ends by jumping to the shared tail .do_filter4x4 in
; intra_pred_ang4_3; ZF from the cmp (set for mode 23 = no transpose) is still
; intact because xchg, pinsrb and the SSE ops do not write EFLAGS.
; [n] = ang_table entry n, weight pair (32 - n, n).
991cglobal intra_pred_ang4_13, 4,4,5
992 cmp r4m, byte 23
993 jnz .load
994 xchg r2, r3
995.load:
996 movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x]
997 palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
998 palignr m2, m1, 2 ; [x x x x 4 3 2 1]
999 pinsrb m1, [r3 + 4], 0 ; byte 0 (the "x" slot) = side reference byte 4
1000 punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
1001 punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
1002 punpcklqdq m2, m0, m1 ; line 2 uses the unshifted pairs, line 3 the pairs starting at x
1003 punpcklqdq m0, m0 ; lines 0 and 1 use the same byte pairs
1004
1005 lea r3, [ang_table + 21 * 16]
1006 movh m3, [r3 + 2 * 16] ; [23]
1007 movhps m3, [r3 - 7 * 16] ; [14]
1008 movh m4, [r3 - 16 * 16] ; [ 5]
1009 movhps m4, [r3 + 7 * 16] ; [28]
1010 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1011
1012
; 4x4 angular prediction, modes 14 and 22. For dirMode == 22 the two reference
; pointers are swapped, so below r2 is always the main reference and r3 the
; side reference; one byte of the side reference (offset 2) is inserted ahead
; of the main reference. Ends by jumping to the shared tail .do_filter4x4 in
; intra_pred_ang4_3; ZF from the cmp (set for mode 22 = no transpose) is still
; intact because xchg, pinsrb and the SSE ops do not write EFLAGS.
; [n] = ang_table entry n, weight pair (32 - n, n).
1013cglobal intra_pred_ang4_14, 4,4,5
1014 cmp r4m, byte 22
1015 jnz .load
1016 xchg r2, r3
1017.load:
1018 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1019 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1020 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1021 pinsrb m2, [r3 + 2], 0 ; byte 0 (the "x" slot) = side reference byte 2
1022 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1023 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1024 punpcklqdq m0, m0 ; lines 0 and 1 use the same byte pairs
1025 punpcklqdq m2, m2 ; lines 2 and 3 use the pairs starting at x
1026
1027 lea r3, [ang_table + 19 * 16]
1028 movh m3, [r3 + 0 * 16] ; [19]
1029 movhps m3, [r3 - 13 * 16] ; [ 6]
1030 movh m4, [r3 + 6 * 16] ; [25]
1031 movhps m4, [r3 - 7 * 16] ; [12]
1032 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1033
1034
; 4x4 angular prediction, modes 15 and 21. For dirMode == 21 the two reference
; pointers are swapped, so below r2 is always the main reference and r3 the
; side reference; two bytes of the side reference (offsets 2 and 4) are
; inserted ahead of the main reference. Ends by jumping to the shared tail
; .do_filter4x4 in intra_pred_ang4_3; ZF from the cmp (set for mode 21 = no
; transpose) is still intact because xchg, pinsrb and the SSE ops do not write
; EFLAGS. [n] = ang_table entry n, weight pair (32 - n, n).
1035cglobal intra_pred_ang4_15, 4,4,5
1036 cmp r4m, byte 21
1037 jnz .load
1038 xchg r2, r3
1039.load:
1040 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1041 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1042 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1043 pinsrb m2, [r3 + 2], 0 ; x = side reference byte 2
1044 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
1045 pinsrb m3, [r3 + 4], 0 ; y = side reference byte 4
1046 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
1047 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1048 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1049 punpcklqdq m0, m2 ; line 0 uses the unshifted pairs, line 1 the pairs starting at x
1050 punpcklqdq m2, m4 ; line 2 uses the pairs starting at x, line 3 those starting at y
1051
1052 lea r3, [ang_table + 23 * 16]
1053 movh m3, [r3 - 8 * 16] ; [15]
1054 movhps m3, [r3 + 7 * 16] ; [30]
1055 movh m4, [r3 - 10 * 16] ; [13]
1056 movhps m4, [r3 + 5 * 16] ; [28]
1057 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1058
1059
; 4x4 angular prediction, modes 16 and 20. For dirMode == 20 the two reference
; pointers are swapped, so below r2 is always the main reference and r3 the
; side reference; two bytes of the side reference (offsets 2 and 3) are
; inserted ahead of the main reference. Ends by jumping to the shared tail
; .do_filter4x4 in intra_pred_ang4_3; ZF from the cmp (set for mode 20 = no
; transpose) is still intact because xchg, pinsrb and the SSE ops do not write
; EFLAGS. [n] = ang_table entry n, weight pair (32 - n, n).
1060cglobal intra_pred_ang4_16, 4,4,5
1061 cmp r4m, byte 20
1062 jnz .load
1063 xchg r2, r3
1064.load:
1065 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1066 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1067 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1068 pinsrb m2, [r3 + 2], 0 ; x = side reference byte 2
1069 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
1070 pinsrb m3, [r3 + 3], 0 ; y = side reference byte 3
1071 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
1072 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1073 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1074 punpcklqdq m0, m2 ; line 0 uses the unshifted pairs, line 1 the pairs starting at x
1075 punpcklqdq m2, m4 ; line 2 uses the pairs starting at x, line 3 those starting at y
1076
1077 lea r3, [ang_table + 19 * 16]
1078 movh m3, [r3 - 8 * 16] ; [11]
1079 movhps m3, [r3 + 3 * 16] ; [22]
1080 movh m4, [r3 - 18 * 16] ; [ 1]
1081 movhps m4, [r3 - 7 * 16] ; [12]
1082 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1083
1084
; 4x4 angular prediction, modes 17 and 19. For dirMode == 19 the two reference
; pointers are swapped, so below r2 is always the main reference and r3 the
; side reference; three bytes of the side reference (offsets 1, 2 and 4) are
; inserted ahead of the main reference. Ends by jumping to the shared tail
; .do_filter4x4 in intra_pred_ang4_3; ZF from the cmp (set for mode 19 = no
; transpose) is still intact because xchg, pinsrb and the SSE ops do not write
; EFLAGS. [n] = ang_table entry n, weight pair (32 - n, n).
1085cglobal intra_pred_ang4_17, 4,4,5
1086 cmp r4m, byte 19
1087 jnz .load
1088 xchg r2, r3
1089.load:
1090 movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x]
1091 palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
1092 palignr m1, m3, 2 ; [- - - - 4 3 2 1]
1093 mova m4, m0
1094 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1095
1096 pinsrb m3, [r3 + 1], 0 ; x = side reference byte 1
1097 punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
1098 punpcklqdq m0, m1 ; lines 0 and 1
1099
1100 pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
1101 pinsrb m2, [r3 + 2], 0 ; y = side reference byte 2
1102 pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
1103 pinsrb m1, [r3 + 4], 0 ; z = side reference byte 4
1104 punpcklbw m1, m2 ; [1 0 0 x x y y z]
1105 punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
1106 punpcklqdq m2, m1 ; lines 2 and 3
1107
1108 lea r3, [ang_table + 14 * 16]
1109 movh m3, [r3 - 8 * 16] ; [ 6]
1110 movhps m3, [r3 - 2 * 16] ; [12]
1111 movh m4, [r3 + 4 * 16] ; [18]
1112 movhps m4, [r3 + 10 * 16] ; [24]
1113 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1114
1115
; 4x4 angular prediction, mode 18. Builds the byte sequence
; refLeft[3], refLeft[2], refLeft[1], refLeft[0], refAbove[1..4] in m0 and
; stores a 4-byte window per row, sliding by one byte per row from the bottom
; row (offset 0) up to the top row (offset 3). dirMode and bFilter are not read.
1116cglobal intra_pred_ang4_18, 4,4,1
1117 mov r2d, [r2] ; refLeft[0..3]
1118 bswap r2d ; reverse the byte order
1119 movd m0, r2d
1120 pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
1121 lea r2, [r1 * 3] ; r2 is reused as 3 * stride
1122 movd [r0 + r2], m0 ; row 3
1123 psrldq m0, 1
1124 movd [r0 + r1 * 2], m0 ; row 2
1125 psrldq m0, 1
1126 movd [r0 + r1], m0 ; row 1
1127 psrldq m0, 1
1128 movd [r0], m0 ; row 0
1129 RET
1130;-----------------------------------------------------------------------------
1131; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
; 8x8 angular prediction, modes 2 and 34. The reference row is refAbove when
; dirMode == 34 and refLeft otherwise; output row k is the 8 reference bytes
; starting at ref + 2 + k. bFilter is not read.
1132;-----------------------------------------------------------------------------
1133INIT_XMM ssse3
1134cglobal intra_pred_ang8_2, 3,5,2
1135 cmp r4m, byte 34
1136 cmove r2, r3mp ; mode 34: switch to refAbove
1137 movu m0, [r2 + 2] ; 16 reference bytes starting at ref + 2
1138 lea r4, [r1 * 3]
1139
1140 movh [r0], m0 ; row 0
1141 palignr m1, m0, 1 ; low bytes = m0 shifted down by 1 (only the low 8 bytes are stored)
1142 movh [r0 + r1], m1 ; row 1
1143 palignr m1, m0, 2
1144 movh [r0 + r1 * 2], m1 ; row 2
1145 palignr m1, m0, 3
1146 movh [r0 + r4], m1 ; row 3
1147 palignr m1, m0, 4
1148 lea r0, [r0 + r1 * 4] ; r0 -> row 4
1149 movh [r0], m1 ; row 4
1150 palignr m1, m0, 5
1151 movh [r0 + r1], m1 ; row 5
1152 palignr m1, m0, 6
1153 movh [r0 + r1 * 2], m1 ; row 6
1154 palignr m1, m0, 7
1155 movh [r0 + r4], m1 ; row 7
1156 RET
1157
; 8x8 angular prediction, modes 3 and 33, plus the transpose/store tail
; .transpose8x8 (presumably shared with the other 8x8 angular routines).
; The reference is refAbove when dirMode == 33, refLeft otherwise.
; [n] = ang_table entry n, weight pair (32 - n, n) applied by pmaddubsw.
; pw_1024 is external; presumably words of 1024, making pmulhrsw compute (x + 16) >> 5.
; On entry to the tail: m4 = lines 0/1, m5 = lines 2/3, m6 = lines 4/5,
; m1 = lines 6/7 (low qword / high qword), and ZF set means "no transpose".
1158INIT_XMM sse4
1159cglobal intra_pred_ang8_3, 3,5,8
1160 cmp r4m, byte 33 ; ZF is kept alive until "jz .store" in the tail
1161 cmove r2, r3mp
1162 lea r3, [ang_table + 22 * 16]
1163 lea r4, [ang_table + 8 * 16]
1164 mova m3, [pw_1024]
1165
1166 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1167 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1168
1169 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1170 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1171 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1172
1173 pmaddubsw m4, m0, [r3 + 4 * 16] ; [26]
1174 pmulhrsw m4, m3
1175 pmaddubsw m1, [r3 - 2 * 16] ; [20]
1176 pmulhrsw m1, m3
1177 packuswb m4, m1 ; lines 0 and 1
1178
1179 palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1180
1181 pmaddubsw m5, [r3 - 8 * 16] ; [14]
1182 pmulhrsw m5, m3
1183
1184 palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1185
1186 pmaddubsw m6, [r4] ; [ 8]
1187 pmulhrsw m6, m3
1188 packuswb m5, m6 ; lines 2 and 3
1189
1190 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1191
1192 pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2]
1193 pmulhrsw m6, m3
1194
1195 pmaddubsw m1, [r3 + 6 * 16] ; [28]
1196 pmulhrsw m1, m3
1197 packuswb m6, m1 ; lines 4 and 5
1198
1199 palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1200
1201 pmaddubsw m1, [r3] ; [22]
1202 pmulhrsw m1, m3
1203
1204 palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
1205
1206 pmaddubsw m2, [r3 - 6 * 16] ; [16]
1207 pmulhrsw m2, m3
1208 packuswb m1, m2 ; lines 6 and 7
1209 jmp .transpose8x8
1210
1211ALIGN 16
1212.transpose8x8:
1213 jz .store ; ZF from the entry cmp: set means store the lines as rows, untransposed
1214
1215 ; transpose 8x8
1216 punpckhbw m0, m4, m5
1217 punpcklbw m4, m5
1218 punpckhbw m2, m4, m0
1219 punpcklbw m4, m0
1220
1221 punpckhbw m0, m6, m1
1222 punpcklbw m6, m1
1223 punpckhbw m1, m6, m0
1224 punpcklbw m6, m0
1225
1226 punpckhdq m5, m4, m6
1227 punpckldq m4, m6
1228 punpckldq m6, m2, m1
1229 punpckhdq m2, m1
1230 mova m1, m2 ; results end up in m4, m5, m6, m1, the layout .store expects
1231
1232.store:
1233 lea r4, [r1 * 3]
1234 movh [r0], m4 ; row 0
1235 movhps [r0 + r1], m4 ; row 1
1236 movh [r0 + r1 * 2], m5 ; row 2
1237 movhps [r0 + r4], m5 ; row 3
1238 add r0, r4 ; r0 -> row 3
1239 movh [r0 + r1], m6 ; row 4
1240 movhps [r0 + r1 * 2], m6 ; row 5
1241 movh [r0 + r4], m1 ; row 6
1242 movhps [r0 + r1 * 4], m1 ; row 7
1243 RET
1244
; intra_pred_ang8_4: 8x8 angular prediction, line fractions
; 21, 10, 31, 20, 9, 30, 19, 8.
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 32. Lines are left in m4, m5, m6, m1 and the function finishes
; in the .transpose8x8 tail of intra_pred_ang8_3, which reads the flags set by
; the cmp below.
1245cglobal intra_pred_ang8_4, 3,5,8
1246 cmp r4m, byte 32
1247 cmove r2, r3mp
1248 lea r3, [ang_table + 24 * 16]
1249 lea r4, [ang_table + 10 * 16]
1250 mova m3, [pw_1024]
1251
1252 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1253 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1254
1255 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1256 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1257 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1258 mova m5, m1 ; same source pairs are reused for fraction 31 below
1259
1260 pmaddubsw m4, m0, [r3 - 3 * 16] ; [21]
1261 pmulhrsw m4, m3
1262 pmaddubsw m1, [r4] ; [10]
1263 pmulhrsw m1, m3
1264 packuswb m4, m1
1265
1266 pmaddubsw m5, [r3 + 7 * 16] ; [31]
1267 pmulhrsw m5, m3
1268
1269 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1270
1271 pmaddubsw m6, [r3 - 4 * 16] ; [ 20]
1272 pmulhrsw m6, m3
1273 packuswb m5, m6
1274
1275 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1276
1277 pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9]
1278 pmulhrsw m6, m3
1279
1280 pmaddubsw m1, [r3 + 6 * 16] ; [30]
1281 pmulhrsw m1, m3
1282 packuswb m6, m1
1283
1284 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1285
1286 pmaddubsw m1, [r3 - 5 * 16] ; [19]
1287 pmulhrsw m1, m3
1288
1289 palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1290
1291 pmaddubsw m2, [r4 - 2 * 16] ; [8]
1292 pmulhrsw m2, m3
1293 packuswb m1, m2
1294 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1295
; intra_pred_ang8_5: 8x8 angular prediction, line fractions
; 17, 2, 19, 4, 21, 6, 23, 8.
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 31. Lines are left in m4, m5, m6, m1 and the function finishes
; in the .transpose8x8 tail of intra_pred_ang8_3, which reads the flags set by
; the cmp below.
1296cglobal intra_pred_ang8_5, 3,5,8
1297 cmp r4m, byte 31
1298 cmove r2, r3mp
1299 lea r3, [ang_table + 17 * 16]
1300 lea r4, [ang_table + 2 * 16]
1301 mova m3, [pw_1024]
1302
1303 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1304 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1305
1306 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1307 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1308 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1309 mova m5, m1
1310
1311 pmaddubsw m4, m0, [r3] ; [17]
1312 pmulhrsw m4, m3
1313 pmaddubsw m1, [r4] ; [2]
1314 pmulhrsw m1, m3
1315 packuswb m4, m1
1316
1317 pmaddubsw m5, [r3 + 2 * 16] ; [19]
1318 pmulhrsw m5, m3
1319
1320 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1321 mova m1, m6
1322
1323 pmaddubsw m1, [r4 + 2 * 16] ; [4]
1324 pmulhrsw m1, m3
1325 packuswb m5, m1
1326
1327 pmaddubsw m6, [r3 + 4 * 16] ; [21]
1328 pmulhrsw m6, m3
1329
1330 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1331
1332 mova m7, m1
1333 pmaddubsw m7, [r4 + 4 * 16] ; [6]
1334 pmulhrsw m7, m3
1335 packuswb m6, m7
1336
1337 pmaddubsw m1, [r3 + 6 * 16] ; [23]
1338 pmulhrsw m1, m3
1339
1340 palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1341
1342 pmaddubsw m2, [r4 + 6 * 16] ; [8]
1343 pmulhrsw m2, m3
1344 packuswb m1, m2
1345 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1346
; intra_pred_ang8_6: 8x8 angular prediction, line fractions
; 13, 26, 7, 20, 1, 14, 27, 8.
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 30. The rounding constant lives in m7 here because m3 is used as
; a temporary. Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3, which reads the flags set by the cmp below.
1347cglobal intra_pred_ang8_6, 3,5,8
1348 cmp r4m, byte 30
1349 cmove r2, r3mp
1350 lea r3, [ang_table + 20 * 16]
1351 lea r4, [ang_table + 8 * 16]
1352 mova m7, [pw_1024]
1353
1354 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1355 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1356
1357 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1358 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1359 mova m1, m0
1360
1361 pmaddubsw m4, m0, [r3 - 7 * 16] ; [13]
1362 pmulhrsw m4, m7
1363 pmaddubsw m1, [r3 + 6 * 16] ; [26]
1364 pmulhrsw m1, m7
1365 packuswb m4, m1
1366
1367 palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1368
1369 pmaddubsw m5, m6, [r4 - 1 * 16] ; [7]
1370 pmulhrsw m5, m7
1371
1372 pmaddubsw m6, [r3] ; [20]
1373 pmulhrsw m6, m7
1374 packuswb m5, m6
1375
1376 palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1377
1378 pmaddubsw m6, m1, [r4 - 7 * 16] ; [1]
1379 pmulhrsw m6, m7
1380
1381 mova m3, m1
1382 pmaddubsw m3, [r3 - 6 * 16] ; [14]
1383 pmulhrsw m3, m7
1384 packuswb m6, m3
1385
1386 pmaddubsw m1, [r3 + 7 * 16] ; [27]
1387 pmulhrsw m1, m7
1388
1389 palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1390
1391 pmaddubsw m2, [r4] ; [8]
1392 pmulhrsw m2, m7
1393 packuswb m1, m2
1394 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1395
; intra_pred_ang8_7: 8x8 angular prediction, line fractions
; 9, 18, 27, 4, 13, 22, 31, 8.
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 29. Rounding constant in m7; m3 is a temporary. Lines are left in
; m4, m5, m6, m1 for the .transpose8x8 tail of intra_pred_ang8_3, which reads
; the flags set by the cmp below.
1396cglobal intra_pred_ang8_7, 3,5,8
1397 cmp r4m, byte 29
1398 cmove r2, r3mp
1399 lea r3, [ang_table + 24 * 16]
1400 lea r4, [ang_table + 6 * 16]
1401 mova m7, [pw_1024]
1402
1403 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1404 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1405
1406 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1407 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1408
1409 pmaddubsw m4, m0, [r4 + 3 * 16] ; [9]
1410 pmulhrsw m4, m7
1411 pmaddubsw m3, m0, [r3 - 6 * 16] ; [18]
1412 pmulhrsw m3, m7
1413 packuswb m4, m3
1414
1415 pmaddubsw m5, m0, [r3 + 3 * 16] ; [27]
1416 pmulhrsw m5, m7
1417
1418 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1419
1420 pmaddubsw m6, m1, [r4 - 2 * 16] ; [4]
1421 pmulhrsw m6, m7
1422 packuswb m5, m6
1423
1424 pmaddubsw m6, m1, [r4 + 7 * 16] ; [13]
1425 pmulhrsw m6, m7
1426
1427 mova m3, m1
1428 pmaddubsw m3, [r3 - 2 * 16] ; [22]
1429 pmulhrsw m3, m7
1430 packuswb m6, m3
1431
1432 pmaddubsw m1, [r3 + 7 * 16] ; [31]
1433 pmulhrsw m1, m7
1434
1435 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1436
1437 pmaddubsw m2, [r4 + 2 * 16] ; [8]
1438 pmulhrsw m2, m7
1439 packuswb m1, m2
1440 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1441
; intra_pred_ang8_8: 8x8 angular prediction, line fractions
; 5, 10, 15, 20, 25, 30, 3, 8. The first six lines share the same source
; pairs (m0); the last two use the pairs shifted by one sample (m2).
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 28. Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail
; of intra_pred_ang8_3, which reads the flags set by the cmp below.
1442cglobal intra_pred_ang8_8, 3,5,8
1443 cmp r4m, byte 28
1444 cmove r2, r3mp
1445 lea r3, [ang_table + 23 * 16]
1446 lea r4, [ang_table + 8 * 16]
1447 mova m7, [pw_1024]
1448
1449 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1450 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1451
1452 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1453 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1454 palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1455
1456 pmaddubsw m4, m0, [r4 - 3 * 16] ; [5]
1457 pmulhrsw m4, m7
1458 pmaddubsw m3, m0, [r4 + 2 * 16] ; [10]
1459 pmulhrsw m3, m7
1460 packuswb m4, m3
1461
1462 pmaddubsw m5, m0, [r3 - 8 * 16] ; [15]
1463 pmulhrsw m5, m7
1464
1465 pmaddubsw m6, m0, [r3 - 3 * 16] ; [20]
1466 pmulhrsw m6, m7
1467 packuswb m5, m6
1468
1469 pmaddubsw m6, m0, [r3 + 2 * 16] ; [25]
1470 pmulhrsw m6, m7
1471
1472 pmaddubsw m0, [r3 + 7 * 16] ; [30]
1473 pmulhrsw m0, m7
1474 packuswb m6, m0
1475
1476 pmaddubsw m1, m2, [r4 - 5 * 16] ; [3]
1477 pmulhrsw m1, m7
1478
1479 pmaddubsw m2, [r4] ; [8]
1480 pmulhrsw m2, m7
1481 packuswb m1, m2
1482 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1483
; intra_pred_ang8_9: 8x8 angular prediction, line fractions
; 2, 4, 6, 8, 10, 12, 14, 16. All eight lines use the same source pairs (m0).
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 27. Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail
; of intra_pred_ang8_3, which reads the flags set by the cmp below.
1484cglobal intra_pred_ang8_9, 3,5,8
1485 cmp r4m, byte 27
1486 cmove r2, r3mp
1487 lea r3, [ang_table + 10 * 16]
1488 mova m7, [pw_1024]
1489
1490 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1491 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1492
1493 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1494
1495 pmaddubsw m4, m0, [r3 - 8 * 16] ; [2]
1496 pmulhrsw m4, m7
1497 pmaddubsw m3, m0, [r3 - 6 * 16] ; [4]
1498 pmulhrsw m3, m7
1499 packuswb m4, m3
1500
1501 pmaddubsw m5, m0, [r3 - 4 * 16] ; [6]
1502 pmulhrsw m5, m7
1503
1504 pmaddubsw m6, m0, [r3 - 2 * 16] ; [8]
1505 pmulhrsw m6, m7
1506 packuswb m5, m6
1507
1508 pmaddubsw m6, m0, [r3] ; [10]
1509 pmulhrsw m6, m7
1510
1511 pmaddubsw m2, m0, [r3 + 2 * 16] ; [12]
1512 pmulhrsw m2, m7
1513 packuswb m6, m2
1514
1515 pmaddubsw m1, m0, [r3 + 4 * 16] ; [14]
1516 pmulhrsw m1, m7
1517
1518 pmaddubsw m0, [r3 + 6 * 16] ; [16]
1519 pmulhrsw m0, m7
1520 packuswb m1, m0
1521 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1522
; intra_pred_ang8_10: 8x8 prediction where every output row is one reference
; byte replicated across the row: row y = [r2 + 1 + y] x 8.
; r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r5m = bFilter.
; pb_unpackbq is defined outside this chunk; from its use here it presumably
; broadcasts source byte 0 over the low qword and byte 1 over the high qword.
; Rows 1..7 are stored first; row 0 is stored at .quit, either as-is or after
; the optional filter below.
1523cglobal intra_pred_ang8_10, 4,5,5
1524 movh m0, [r2 + 1]
1525 mova m4, [pb_unpackbq]
1526 palignr m1, m0, 2 ; bytes 2.. of m0; only bytes 0 and 1 are read by the shuffle
1527 pshufb m1, m4 ; rows 2 and 3
1528 palignr m2, m0, 4
1529 pshufb m2, m4 ; rows 4 and 5
1530 palignr m3, m0, 6
1531 pshufb m3, m4 ; rows 6 and 7
1532 pshufb m0, m4 ; rows 0 and 1
1533
1534 lea r4, [r1 * 3]
1535 movhps [r0 + r1], m0
1536 movh [r0 + r1 * 2], m1
1537 movhps [r0 + r4], m1
1538 lea r2, [r0 + r1 * 4]
1539 movh [r2], m2
1540 movhps [r2 + r1], m2
1541 movh [r2 + r1 * 2], m3
1542 movhps [r2 + r4], m3
1543
1544; filter
; When bFilter != 0, row 0 becomes
;   row0[x] = sat_u8(row0[x] + (([r3 + 1 + x] - [r3]) >> 1))
; computed in 16-bit lanes and clamped by packuswb.
1545 cmp r5m, byte 0
1546 jz .quit
1547
1548 pmovzxbw m0, m0
1549 movu m1, [r3]
1550 palignr m2, m1, 1 ; m2 low 8 bytes = [r3 + 1 .. r3 + 8]
1551 pshufb m1, m4 ; low qword = [r3] replicated
1552 pmovzxbw m1, m1
1553 pmovzxbw m2, m2
1554 psubw m2, m1
1555 psraw m2, 1
1556 paddw m0, m2
1557 packuswb m0, m0
1558
1559.quit:
1560 movh [r0], m0 ; row 0 (filtered or not)
1561 RET
1562
; intra_pred_ang8_26: 8x8 prediction where every output row is a copy of the
; same 8 reference bytes [r3 + 1 .. r3 + 8].
; r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r5m = bFilter.
; When bFilter != 0 the first column is then overwritten with
;   col0[y] = sat_u8([r3 + 1] + (([r2 + 1 + y] - [r2]) >> 1))
; (pb_unpackbq is defined outside this chunk; presumably it broadcasts source
; byte 0 over the low qword, which is what the arithmetic below relies on).
1563cglobal intra_pred_ang8_26, 4,5,3
1564 movh m0, [r3 + 1]
1565
1566 lea r4, [r1 * 3]
1567 movh [r0], m0
1568 movh [r0 + r1], m0
1569 movh [r0 + r1 * 2], m0
1570 movh [r0 + r4], m0
1571 lea r3, [r0 + r1 * 4] ; r3 is reused as the pointer to row 4
1572 movh [r3], m0
1573 movh [r3 + r1], m0
1574 movh [r3 + r1 * 2], m0
1575 movh [r3 + r4], m0
1576
1577; filter
1578 cmp r5m, byte 0
1579 jz .quit
1580
1581 pshufb m0, [pb_unpackbq]
1582 pmovzxbw m0, m0
1583 movu m1, [r2]
1584 palignr m2, m1, 1 ; m2 low 8 bytes = [r2 + 1 .. r2 + 8]
1585 pshufb m1, [pb_unpackbq]
1586 pmovzxbw m1, m1
1587 pmovzxbw m2, m2
1588 psubw m2, m1
1589 psraw m2, 1
1590 paddw m0, m2
1591 packuswb m0, m0
; Scatter the 8 filtered bytes down the first column, one per row.
1592 pextrb [r0], m0, 0
1593 pextrb [r0 + r1], m0, 1
1594 pextrb [r0 + r1 * 2], m0, 2
1595 pextrb [r0 + r4], m0, 3
1596 pextrb [r3], m0, 4
1597 pextrb [r3 + r1], m0, 5
1598 pextrb [r3 + r1 * 2], m0, 6
1599 pextrb [r3 + r4], m0, 7
1600
1601.quit:
1602 RET
1603
; intra_pred_ang8_11: 8x8 angular prediction, line fractions
; 30, 28, 26, 24, 22, 20, 18, 16. The reference is read from offset 0 (not
; +1 as in modes 3..9) and all eight lines use the same source pairs (m0).
; r0 = dst, r1 = dstStride, r2 = refLeft, replaced by the 4th argument when
; dirMode == 25. Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail
; of intra_pred_ang8_3, which reads the flags set by the cmp below.
1604cglobal intra_pred_ang8_11, 3,5,8
1605 cmp r4m, byte 25
1606 cmove r2, r3mp
1607 lea r3, [ang_table + 23 * 16]
1608 mova m7, [pw_1024]
1609
1610 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1611 palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1612
1613 punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1614
1615 pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
1616 pmulhrsw m4, m7
1617 pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
1618 pmulhrsw m3, m7
1619 packuswb m4, m3
1620
1621 pmaddubsw m5, m0, [r3 + 3 * 16] ; [26]
1622 pmulhrsw m5, m7
1623
1624 pmaddubsw m6, m0, [r3 + 1 * 16] ; [24]
1625 pmulhrsw m6, m7
1626 packuswb m5, m6
1627
1628 pmaddubsw m6, m0, [r3 - 1 * 16] ; [22]
1629 pmulhrsw m6, m7
1630
1631 pmaddubsw m2, m0, [r3 - 3 * 16] ; [20]
1632 pmulhrsw m2, m7
1633 packuswb m6, m2
1634
1635 pmaddubsw m1, m0, [r3 - 5 * 16] ; [18]
1636 pmulhrsw m1, m7
1637
1638 pmaddubsw m0, [r3 - 7 * 16] ; [16]
1639 pmulhrsw m0, m7
1640 packuswb m1, m0
1641 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1642
; intra_pred_ang8_12: 8x8 angular prediction, line fractions
; 27, 22, 17, 12, 7, 2, 29, 24. The last two lines need one sample ("a" in
; the lane comments) that lies before the main reference; it is taken from
; the other reference array.
; r0 = dst, r1 = dstStride. After the swap below r2 is the main reference and
; r3 the other one: (refLeft, refAbove) normally, exchanged when dirMode == 24.
; Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3; mov/cmov/lea do not write flags, so ZF from the cmp
; survives to that tail.
1643cglobal intra_pred_ang8_12, 4,5,8
1644 cmp r4m, byte 24
1645 mov r4, r2
1646 cmovz r2, r3
1647 cmovz r3, r4
1648
1649 lea r4, [ang_table + 22 * 16]
1650 mova m7, [pw_1024]
1651
1652 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1653 pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
1654 pinsrb m0, [r3 + 6], 0 ; a = byte 6 of the other reference
1655 punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
1656 punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1657 palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1658
1659 pmaddubsw m4, m2, [r4 + 5 * 16] ; [27]
1660 pmulhrsw m4, m7
1661 pmaddubsw m3, m2, [r4] ; [22]
1662 pmulhrsw m3, m7
1663 packuswb m4, m3
1664
1665 pmaddubsw m1, m0, [r4 + 7 * 16] ; [29]
1666 pmulhrsw m1, m7
1667
1668 pmaddubsw m0, [r4 + 2 * 16] ; [24]
1669 pmulhrsw m0, m7
1670 packuswb m1, m0 ; m1 = last two lines (fractions 29, 24), computed early
1671
1672 pmaddubsw m5, m2, [r4 - 5 * 16] ; [17]
1673 pmulhrsw m5, m7
1674
1675 lea r4, [ang_table + 7 * 16]
1676 pmaddubsw m6, m2, [r4 + 5 * 16] ; [12]
1677 pmulhrsw m6, m7
1678 packuswb m5, m6
1679
1680 pmaddubsw m6, m2, [r4] ; [7]
1681 pmulhrsw m6, m7
1682
1683 pmaddubsw m2, [r4 - 5 * 16] ; [2]
1684 pmulhrsw m2, m7
1685 packuswb m6, m2
1686 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1687
; intra_pred_ang8_13: 8x8 angular prediction, line fractions
; 23, 14, 5, 28, 19, 10, 1, 24. Two samples before the main reference ("a"
; and "b" in the lane comments) are taken from the other reference array.
; r0 = dst, r1 = dstStride. After the swap below r2 is the main reference and
; r3 the other one: (refLeft, refAbove) normally, exchanged when dirMode == 23.
; Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3, which reads the flags set by the cmp below.
1688cglobal intra_pred_ang8_13, 4,5,8
1689 cmp r4m, byte 23
1690 mov r4, r2
1691 cmovz r2, r3
1692 cmovz r3, r4
1693
1694 lea r4, [ang_table + 24 * 16]
1695 mova m7, [pw_1024]
1696
1697 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1698 pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
1699 pinsrb m1, [r3 + 4], 0 ; a = byte 4 of the other reference
1700 pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
1701 pinsrb m0, [r3 + 7], 0 ; b = byte 7 of the other reference
1702 punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1703 punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1704 palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1705 palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1706
1707 pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
1708 pmulhrsw m4, m7
1709
1710 pmaddubsw m6, m1, [r4 + 4 * 16] ; [28]
1711 pmulhrsw m6, m7
1712
1713 pmaddubsw m0, [r4] ; [24]
1714 pmulhrsw m0, m7
1715
1716 lea r4, [ang_table + 13 * 16]
1717 pmaddubsw m3, m5, [r4 + 1 * 16] ; [14]
1718 pmulhrsw m3, m7
1719 packuswb m4, m3
1720
1721 pmaddubsw m5, [r4 - 8 * 16] ; [5]
1722 pmulhrsw m5, m7
1723 packuswb m5, m6
1724
1725 pmaddubsw m6, m1, [r4 + 6 * 16] ; [19]
1726 pmulhrsw m6, m7
1727
1728 pmaddubsw m2, m1, [r4 - 3 * 16] ; [10]
1729 pmulhrsw m2, m7
1730 packuswb m6, m2
1731
1732 pmaddubsw m1, [r4 - 12 * 16] ; [1]
1733 pmulhrsw m1, m7
1734 packuswb m1, m0
1735 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1736
; intra_pred_ang8_14: 8x8 angular prediction, line fractions
; 19, 6, 25, 12, 31, 18, 5, 24. Three samples before the main reference
; ("a", "b", "c" in the lane comments) are taken from the other reference
; array.
; r0 = dst, r1 = dstStride. After the swap below r2 is the main reference and
; r3 the other one: (refLeft, refAbove) normally, exchanged when dirMode == 22.
; Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3, which reads the flags set by the cmp below.
1737cglobal intra_pred_ang8_14, 4,5,8
1738 cmp r4m, byte 22
1739 mov r4, r2
1740 cmovz r2, r3
1741 cmovz r3, r4
1742
1743 lea r4, [ang_table + 24 * 16]
1744 mova m3, [pw_1024]
1745
; NOTE(review): this load starts 2 bytes before the reference pointer; both
; of those bytes are overwritten by the pinsrb pair below, but the memory must
; be readable — presumably the caller's buffer has leading padding; confirm.
1746 movu m1, [r2 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
1747 pinsrb m1, [r3 + 2], 1 ; a = byte 2 of the other reference
1748 pinsrb m1, [r3 + 5], 0 ; b = byte 5 of the other reference
1749 pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
1750 pinsrb m0, [r3 + 7], 0 ; c = byte 7 of the other reference
1751 punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1752 punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1753 palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1754 palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1755 palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1756
1757 pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
1758 pmulhrsw m4, m3
1759
1760 pmaddubsw m0, [r4] ; [24]
1761 pmulhrsw m0, m3
1762
1763 pmaddubsw m5, m6, [r4 + 1 * 16] ; [25]
1764 pmulhrsw m5, m3
1765
1766 lea r4, [ang_table + 12 * 16]
1767 pmaddubsw m6, [r4] ; [12]
1768 pmulhrsw m6, m3
1769 packuswb m5, m6
1770
1771 pmaddubsw m6, m1, [r4 + 19 * 16] ; [31]
1772 pmulhrsw m6, m3
1773
1774 pmaddubsw m2, [r4 - 6 * 16] ; [6]
1775 pmulhrsw m2, m3
1776 packuswb m4, m2
1777
1778 pmaddubsw m2, m1, [r4 + 6 * 16] ; [18]
1779 pmulhrsw m2, m3
1780 packuswb m6, m2
1781
1782 pmaddubsw m1, [r4 - 7 * 16] ; [5]
1783 pmulhrsw m1, m3
1784 packuswb m1, m0
1785 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1786
; intra_pred_ang8_15: 8x8 angular prediction, line fractions
; 15, 30, 13, 28, 11, 26, 9, 24. Four samples before the main reference
; ("a".."d" in the lane comments) are taken from the other reference array:
; three through the c_mode16_15 shuffle (table defined outside this chunk)
; and one with pinsrb.
; r0 = dst, r1 = dstStride. After the swap below r2 is the main reference and
; r3 the other one: (refLeft, refAbove) normally, exchanged when dirMode == 21.
; Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3, which reads the flags set by the cmp below.
1787cglobal intra_pred_ang8_15, 4,5,8
1788 cmp r4m, byte 21
1789 mov r4, r2
1790 cmovz r2, r3
1791 cmovz r3, r4
1792
1793 lea r4, [ang_table + 23 * 16]
1794 mova m3, [pw_1024]
1795
1796 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1797 movu m2, [r3]
1798 pshufb m2, [c_mode16_15]
1799 palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
1800 pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
1801 pinsrb m0, [r3 + 8], 0 ; d = byte 8 of the other reference
1802 punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1803 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1804 palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1805 palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1806 palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1807 palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1808
1809 pmaddubsw m4, [r4 - 8 * 16] ; [15]
1810 pmulhrsw m4, m3
1811
1812 pmaddubsw m2, m5, [r4 + 7 * 16] ; [30]
1813 pmulhrsw m2, m3
1814 packuswb m4, m2
1815
1816 pmaddubsw m5, [r4 - 10 * 16] ; [13]
1817 pmulhrsw m5, m3
1818
1819 pmaddubsw m2, m6, [r4 + 5 * 16] ; [28]
1820 pmulhrsw m2, m3
1821 packuswb m5, m2
1822
1823 pmaddubsw m2, m1, [r4 + 3 * 16] ; [26]
1824 pmulhrsw m2, m3
1825
1826 pmaddubsw m0, [r4 + 1 * 16] ; [24]
1827 pmulhrsw m0, m3
1828
1829 lea r4, [ang_table + 11 * 16]
1830 pmaddubsw m6, [r4] ; [11]
1831 pmulhrsw m6, m3
1832 packuswb m6, m2
1833
1834 pmaddubsw m1, [r4 - 2 * 16] ; [9]
1835 pmulhrsw m1, m3
1836 packuswb m1, m0
1837 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1838
; intra_pred_ang8_16: 8x8 angular prediction, line fractions
; 11, 22, 1, 12, 23, 2, 13, 24. Five samples before the main reference
; ("a".."e" in the lane comments) are taken from the other reference array:
; four through the c_mode16_16 shuffle (table defined outside this chunk)
; and one with pinsrb.
; r0 = dst, r1 = dstStride. After the swap below r2 is the main reference and
; r3 the other one: (refLeft, refAbove) normally, exchanged when dirMode == 20.
; Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3, which reads the flags set by the cmp below.
1839cglobal intra_pred_ang8_16, 4,5,8
1840 cmp r4m, byte 20
1841 mov r4, r2
1842 cmovz r2, r3
1843 cmovz r3, r4
1844
1845 lea r4, [ang_table + 22 * 16]
1846 mova m7, [pw_1024]
1847
1848 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1849 movu m2, [r3]
1850 pshufb m2, [c_mode16_16]
1851 palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
1852 pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
1853 pinsrb m0, [r3 + 8], 0 ; e = byte 8 of the other reference
1854 punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1855 punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
1856 palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1857 palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1858 palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1859 palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1860 palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1861
1862 pmaddubsw m3, m5, [r4] ; [22]
1863 pmulhrsw m3, m7
1864
1865 pmaddubsw m0, [r4 + 2 * 16] ; [24]
1866 pmulhrsw m0, m7
1867
1868 lea r4, [ang_table + 9 * 16]
1869
1870 pmaddubsw m4, [r4 + 2 * 16] ; [11]
1871 pmulhrsw m4, m7
1872 packuswb m4, m3
1873
1874 pmaddubsw m2, [r4 + 3 * 16] ; [12]
1875 pmulhrsw m2, m7
1876
1877 pmaddubsw m5, [r4 - 8 * 16] ; [1]
1878 pmulhrsw m5, m7
1879 packuswb m5, m2
1880
1881 mova m2, m6
1882 pmaddubsw m6, [r4 + 14 * 16] ; [23]
1883 pmulhrsw m6, m7
1884
1885 pmaddubsw m2, [r4 - 7 * 16] ; [2]
1886 pmulhrsw m2, m7
1887 packuswb m6, m2
1888
1889 pmaddubsw m1, [r4 + 4 * 16] ; [13]
1890 pmulhrsw m1, m7
1891 packuswb m1, m0
1892 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1893
; intra_pred_ang8_17: 8x8 angular prediction, line fractions
; 6, 12, 18, 24, 30, 4, 10, 16. Six samples before the main reference
; ("a".."f" in the lane comments) are taken from the other reference array:
; five through the c_mode16_17 shuffle (table defined outside this chunk)
; and one with pinsrb.
; r0 = dst, r1 = dstStride. After the swap below r2 is the main reference and
; r3 the other one: (refLeft, refAbove) normally, exchanged when dirMode == 19.
; Lines are left in m4, m5, m6, m1 for the .transpose8x8 tail of
; intra_pred_ang8_3, which reads the flags set by the cmp below.
1894cglobal intra_pred_ang8_17, 4,5,8
1895 cmp r4m, byte 19
1896 mov r4, r2
1897 cmovz r2, r3
1898 cmovz r3, r4
1899
1900 lea r4, [ang_table + 17 * 16]
1901 mova m3, [pw_1024]
1902
1903 movu m2, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1904 movu m1, [r3]
1905 pshufb m1, [c_mode16_17]
1906 palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
1907 pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
1908 pinsrb m0, [r3 + 7], 0 ; f = byte 7 of the other reference
1909 punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1910 punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]
1911
1912 palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1913 palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1914 palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1915
1916
1917 pmaddubsw m2, [r4 - 5 * 16] ; [12]
1918 pmulhrsw m2, m3
1919
1920 pmaddubsw m4, [r4 - 11 * 16] ; [6]
1921 pmulhrsw m4, m3
1922 packuswb m4, m2
1923
1924 pmaddubsw m5, [r4 + 1 * 16] ; [18]
1925 pmulhrsw m5, m3
1926
1927 palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1928 pmaddubsw m2, [r4 + 7 * 16] ; [24]
1929 pmulhrsw m2, m3
1930 packuswb m5, m2
1931
1932 palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1933 mova m2, m6
1934 pmaddubsw m6, [r4 + 13 * 16] ; [30]
1935 pmulhrsw m6, m3
1936
1937 pmaddubsw m2, [r4 - 13 * 16] ; [4]
1938 pmulhrsw m2, m3
1939 packuswb m6, m2
1940
1941 palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e]
1942 pmaddubsw m1, [r4 - 7 * 16] ; [10]
1943 pmulhrsw m1, m3
1944
1945 pmaddubsw m0, [r4 - 1 * 16] ; [16]
1946 pmulhrsw m0, m3
1947 packuswb m1, m0
1948 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1949
; intra_pred_ang8_18: 8x8 prediction along the diagonal, no interpolation.
; r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; A 16-byte line is built as [r2 + 7], [r2 + 6], ..., [r2 + 0] followed by
; [r3 + 1 .. r3 + 8] (assuming pb_swap8, defined in the file's rodata, reverses
; the bytes within each qword). Row 7 is the low 8 bytes of that line and each
; row above it is the same line shifted by one more byte.
1950cglobal intra_pred_ang8_18, 4,4,1
1951 movu m0, [r2]
1952 pshufb m0, [pb_swap8]
1953 movhps m0, [r3 + 1] ; high qword replaced by [r3 + 1 .. r3 + 8]
1954 lea r2, [r0 + r1 * 4] ; r2 -> row 4
1955 lea r3, [r1 * 3] ; r3 = 3 * stride
1956 movh [r2 + r3], m0 ; row 7
1957 psrldq m0, 1
1958 movh [r2 + r1 * 2], m0 ; row 6
1959 psrldq m0, 1
1960 movh [r2 + r1], m0 ; row 5
1961 psrldq m0, 1
1962 movh [r2], m0 ; row 4
1963 psrldq m0, 1
1964 movh [r0 + r3], m0 ; row 3
1965 psrldq m0, 1
1966 movh [r0 + r1 * 2], m0 ; row 2
1967 psrldq m0, 1
1968 movh [r0 + r1], m0 ; row 1
1969 psrldq m0, 1
1970 movh [r0], m0 ; row 0
1971 RET
1972
1973
1974;-----------------------------------------------------------------------------
1975; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
1976;-----------------------------------------------------------------------------
; intra_pred_ang16_2: 16x16 prediction with no interpolation. Output row y is
; the 16 reference bytes starting at ref[2 + y].
; r0 = dst, r1 = dstStride, r2 = refLeft; when dirMode (5th argument) == 34
; the 4th argument (refAbove) is used as the reference instead.
; m0 = ref[2..17] and m1 = ref[18..33]; each row is a byte-granular window
; over the pair (m1:m0) taken with palignr. dst advances two rows at a time.
1977INIT_XMM ssse3
1978cglobal intra_pred_ang16_2, 3,3,3
1979 cmp r4m, byte 34
1980 cmove r2, r3mp
1981 movu m0, [r2 + 2]
1982 movu m1, [r2 + 18]
1983 movu [r0], m0 ; row 0
1984 palignr m2, m1, m0, 1
1985 movu [r0 + r1], m2 ; row 1
1986 lea r0, [r0 + r1 * 2]
1987 palignr m2, m1, m0, 2
1988 movu [r0], m2 ; row 2
1989 palignr m2, m1, m0, 3
1990 movu [r0 + r1], m2 ; row 3
1991 lea r0, [r0 + r1 * 2]
1992 palignr m2, m1, m0, 4
1993 movu [r0], m2 ; row 4
1994 palignr m2, m1, m0, 5
1995 movu [r0 + r1], m2 ; row 5
1996 lea r0, [r0 + r1 * 2]
1997 palignr m2, m1, m0, 6
1998 movu [r0], m2 ; row 6
1999 palignr m2, m1, m0, 7
2000 movu [r0 + r1], m2 ; row 7
2001 lea r0, [r0 + r1 * 2]
2002 palignr m2, m1, m0, 8
2003 movu [r0], m2 ; row 8
2004 palignr m2, m1, m0, 9
2005 movu [r0 + r1], m2 ; row 9
2006 lea r0, [r0 + r1 * 2]
2007 palignr m2, m1, m0, 10
2008 movu [r0], m2 ; row 10
2009 palignr m2, m1, m0, 11
2010 movu [r0 + r1], m2 ; row 11
2011 lea r0, [r0 + r1 * 2]
2012 palignr m2, m1, m0, 12
2013 movu [r0], m2 ; row 12
2014 palignr m2, m1, m0, 13
2015 movu [r0 + r1], m2 ; row 13
2016 lea r0, [r0 + r1 * 2]
2017 palignr m2, m1, m0, 14
2018 movu [r0], m2 ; row 14
2019 palignr m2, m1, m0, 15
2020 movu [r0 + r1], m2 ; row 15
2021 RET
2022
; TRANSPOSE_STORE_8x8 col, transpose, a, b, c, d
;   col       (%1): index of the 8-byte-wide column tile to write; only used
;                   when transpose == 1 (byte offset col * 8 from r0 / r6).
;   transpose (%2): 1 = transpose the 8x8 byte tile, then store it;
;                   anything else = store the eight lines as they are.
;   a..d  (%3..%6): four xmm registers holding eight 8-byte lines, two per
;                   register (low qword first).
; Register contract, as used by the callers in this file:
;   r1 = stride, r5 = 3 * stride.
;   transpose == 1: rows 0..3 go through r0 and rows 4..7 through r6 (callers
;                   keep r6 = r0 + 4 * stride); r0 and r6 are not modified;
;                   m0 is clobbered and %3..%6 are overwritten.
;   otherwise:      r0 is advanced by 8 rows; %3..%6 are preserved.
; The second unpack stage uses %6 rather than a hard-coded m1, so the macro is
; correct for any register passed as the 6th argument (every existing caller
; passes m1, for which the generated code is unchanged).
2023%macro TRANSPOSE_STORE_8x8 6
2024 %if %2 == 1
2025 ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
2026 punpckhbw m0, %3, %4
2027 punpcklbw %3, %4
2028 punpckhbw %4, %3, m0
2029 punpcklbw %3, m0
2030
2031 punpckhbw m0, %5, %6
2032 punpcklbw %5, %6
2033 punpckhbw %6, %5, m0
2034 punpcklbw %5, m0
2035
2036 punpckhdq m0, %3, %5
2037 punpckldq %3, %5
2038 punpckldq %5, %4, %6
2039 punpckhdq %4, %6
2040
2041 movh [r0 + %1 * 8], %3
2042 movhps [r0 + r1 + %1 * 8], %3
2043 movh [r0 + r1*2 + %1 * 8], m0
2044 movhps [r0 + r5 + %1 * 8], m0
2045 movh [r6 + %1 * 8], %5
2046 movhps [r6 + r1 + %1 * 8], %5
2047 movh [r6 + r1*2 + %1 * 8], %4
2048 movhps [r6 + r5 + %1 * 8], %4
2049 %else
2050 ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
2051 movh [r0 ], %3
2052 movhps [r0 + r1 ], %3
2053 movh [r0 + r1 * 2], %4
2054 movhps [r0 + r5 ], %4
2055 lea r0, [r0 + r1 * 4]
2056 movh [r0 ], %5
2057 movhps [r0 + r1 ], %5
2058 movh [r0 + r1 * 2], %6
2059 movhps [r0 + r5 ], %6
2060 lea r0, [r0 + r1 * 4]
2061 %endif
2062%endmacro
2063
; intra_pred_ang16_3: 16x16 angular prediction read from the 3rd argument
; (refLeft); every tile is transposed before it is stored.
; r0 = dst, r1 = dstStride, r2 = reference pointer.
; The loop runs twice (r4). Each pass computes sixteen 8-byte lines with the
; fractions 26,20,14,8,2,28,22,16 then 10,4,30,24,18,12,6,0, stores them as
; two transposed 8x8 tiles (column tiles 0 and 1) covering 8 output rows, and
; then moves dst down 8 rows and the reference forward 8 bytes.
; ang_table and pw_1024 are defined outside this chunk; pmulhrsw by pw_1024 is
; presumably a rounded >> 5 of the weighted sums.
2064INIT_XMM sse4
2065cglobal intra_pred_ang16_3, 3,7,8
2066
2067 lea r3, [ang_table + 16 * 16]
2068 mov r4d, 2
2069 lea r5, [r1 * 3] ; r5 -> 3 * stride
2070 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2071 mova m7, [pw_1024]
2072
2073.loop:
2074 movu m0, [r2 + 1]
2075 palignr m1, m0, 1
2076
2077 punpckhbw m2, m0, m1
2078 punpcklbw m0, m1
2079 palignr m1, m2, m0, 2
2080
2081 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
2082 pmulhrsw m4, m7
2083 pmaddubsw m1, [r3 + 4 * 16] ; [20]
2084 pmulhrsw m1, m7
2085 packuswb m4, m1
2086
2087 palignr m5, m2, m0, 4
2088
2089 pmaddubsw m5, [r3 - 2 * 16] ; [14]
2090 pmulhrsw m5, m7
2091
2092 palignr m6, m2, m0, 6
2093
2094 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
2095 pmulhrsw m6, m7
2096 packuswb m5, m6
2097
2098 palignr m1, m2, m0, 8
2099
2100 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
2101 pmulhrsw m6, m7
2102
2103 pmaddubsw m1, [r3 + 12 * 16] ; [28]
2104 pmulhrsw m1, m7
2105 packuswb m6, m1
2106
2107 palignr m1, m2, m0, 10
2108
2109 pmaddubsw m1, [r3 + 6 * 16] ; [22]
2110 pmulhrsw m1, m7
2111
2112 palignr m2, m0, 12
2113
2114 pmaddubsw m2, [r3] ; [16]
2115 pmulhrsw m2, m7
2116 packuswb m1, m2
2117
2118 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2119
; Second group of eight lines: the reference window starts 7 bytes further on.
2120 movu m0, [r2 + 8]
2121 palignr m1, m0, 1
2122
2123 punpckhbw m2, m0, m1
2124 punpcklbw m0, m1
2125 palignr m5, m2, m0, 2
2126
2127 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
2128 pmulhrsw m4, m7
2129 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
2130 pmulhrsw m1, m7
2131 packuswb m4, m1
2132
2133 pmaddubsw m5, [r3 + 14 * 16] ; [30]
2134 pmulhrsw m5, m7
2135
2136 palignr m6, m2, m0, 4
2137
2138 pmaddubsw m6, [r3 + 8 * 16] ; [24]
2139 pmulhrsw m6, m7
2140 packuswb m5, m6
2141
2142 palignr m1, m2, m0, 6
2143
2144 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
2145 pmulhrsw m6, m7
2146
2147 palignr m1, m2, m0, 8
2148
2149 pmaddubsw m1, [r3 - 4 * 16] ; [12]
2150 pmulhrsw m1, m7
2151 packuswb m6, m1
2152
2153 palignr m1, m2, m0, 10
2154
2155 pmaddubsw m1, [r3 - 10 * 16] ; [06]
2156 pmulhrsw m1, m7
2157 packuswb m1, m1
2158
; Fraction 0 needs no interpolation: the raw reference bytes are used directly.
2159 movhps m1, [r2 + 14] ; [00]
2160
2161 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2162
2163 lea r0, [r6 + r1 * 4] ; r0 -> 8 rows below its previous position
2164 lea r6, [r6 + r1 * 8] ; r6 -> again 4 rows below the new r0
2165 add r2, 8
2166 dec r4
2167 jnz .loop
2168
2169 RET
2170
; intra_pred_ang16_33: 16x16 angular prediction read from the 4th argument
; (refAbove); same fractions as intra_pred_ang16_3 but stored without a
; transpose.
; r0 = dst, r1 = dstStride; r2 is reloaded with the 4th argument.
; The loop runs twice (r4). Each pass writes an 8-column x 16-row strip:
; rows 0..7 through TRANSPOSE_STORE_8x8 (store-only path, which advances r0 by
; 8 rows) and rows 8..15 by the explicit stores at the end. dst then moves
; 8 columns right (r6 holds the original dst) and the reference 8 bytes on.
; ang_table and pw_1024 are defined outside this chunk; pmulhrsw by pw_1024 is
; presumably a rounded >> 5 of the weighted sums.
2171INIT_XMM sse4
2172cglobal intra_pred_ang16_33, 3,7,8
2173 mov r2, r3mp
2174 lea r3, [ang_table + 16 * 16]
2175 mov r4d, 2
2176 lea r5, [r1 * 3]
2177 mov r6, r0
2178 mova m7, [pw_1024]
2179
2180.loop:
2181 movu m0, [r2 + 1]
2182 palignr m1, m0, 1
2183
2184 punpckhbw m2, m0, m1
2185 punpcklbw m0, m1
2186 palignr m1, m2, m0, 2
2187
2188 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
2189 pmulhrsw m4, m7
2190 pmaddubsw m1, [r3 + 4 * 16] ; [20]
2191 pmulhrsw m1, m7
2192 packuswb m4, m1
2193
2194 palignr m5, m2, m0, 4
2195
2196 pmaddubsw m5, [r3 - 2 * 16] ; [14]
2197 pmulhrsw m5, m7
2198
2199 palignr m6, m2, m0, 6
2200
2201 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
2202 pmulhrsw m6, m7
2203 packuswb m5, m6
2204
2205 palignr m1, m2, m0, 8
2206
2207 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
2208 pmulhrsw m6, m7
2209
2210 pmaddubsw m1, [r3 + 12 * 16] ; [28]
2211 pmulhrsw m1, m7
2212 packuswb m6, m1
2213
2214 palignr m1, m2, m0, 10
2215
2216 pmaddubsw m1, [r3 + 6 * 16] ; [22]
2217 pmulhrsw m1, m7
2218
2219 palignr m2, m0, 12
2220
2221 pmaddubsw m2, [r3] ; [16]
2222 pmulhrsw m2, m7
2223 packuswb m1, m2
2224
2225 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2226
2227 movu m0, [r2 + 8]
2228 palignr m1, m0, 1
2229
2230 punpckhbw m2, m0, m1
2231 punpcklbw m0, m1
2232 palignr m5, m2, m0, 2
2233
2234 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
2235 pmulhrsw m4, m7
2236 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
2237 pmulhrsw m1, m7
2238 packuswb m4, m1
2239
2240 pmaddubsw m5, [r3 + 14 * 16] ; [30]
2241 pmulhrsw m5, m7
2242
2243 palignr m6, m2, m0, 4
2244
2245 pmaddubsw m6, [r3 + 8 * 16] ; [24]
2246 pmulhrsw m6, m7
2247 packuswb m5, m6
2248
2249 palignr m1, m2, m0, 6
2250
2251 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
2252 pmulhrsw m6, m7
2253
2254 palignr m1, m2, m0, 8
2255
2256 pmaddubsw m1, [r3 - 4 * 16] ; [12]
2257 pmulhrsw m1, m7
2258 packuswb m6, m1
2259
2260 palignr m1, m2, m0, 10
2261
2262 pmaddubsw m1, [r3 - 10 * 16] ; [06]
2263 pmulhrsw m1, m7
2264 packuswb m1, m1
2265
; Fraction 0 needs no interpolation: row 15 is the raw reference bytes.
2266 movh m2, [r2 + 14] ; [00]
2267
; Rows 8..15 of this strip (r0 was advanced to row 8 by the macro above).
2268 movh [r0 ], m4
2269 movhps [r0 + r1 ], m4
2270 movh [r0 + r1 * 2], m5
2271 movhps [r0 + r5 ], m5
2272 lea r0, [r0 + r1 * 4]
2273 movh [r0 ], m6
2274 movhps [r0 + r1 ], m6
2275 movh [r0 + r1 * 2], m1
2276 movh [r0 + r5 ], m2
2277
2278 lea r0, [r6 + 8] ; back to row 0, 8 columns to the right
2279 add r2, 8
2280 dec r4
2281 jnz .loop
2282
2283 RET
2284
; intra_pred_ang16_4: 16x16 angular prediction read from the 3rd argument
; (refLeft); every tile is transposed before it is stored.
; r0 = dst, r1 = dstStride, r2 = reference pointer.
; The loop runs twice (r4). Each pass computes sixteen 8-byte lines with the
; fractions 21,10,31,20,9,30,19,8 then 29,18,7,28,17,6,27,16, stores them as
; two transposed 8x8 tiles (column tiles 0 and 1) covering 8 output rows, and
; then moves dst down 8 rows and the reference forward 8 bytes.
; ang_table and pw_1024 are defined outside this chunk; pmulhrsw by pw_1024 is
; presumably a rounded >> 5 of the weighted sums.
2285INIT_XMM sse4
2286cglobal intra_pred_ang16_4, 3,7,8
2287
2288 lea r3, [ang_table + 16 * 16]
2289 mov r4d, 2
2290 lea r5, [r1 * 3] ; r5 -> 3 * stride
2291 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2292 mova m7, [pw_1024]
2293
2294.loop:
2295 movu m0, [r2 + 1]
2296 palignr m1, m0, 1
2297
2298 punpckhbw m2, m0, m1
2299 punpcklbw m0, m1
2300 palignr m1, m2, m0, 2
2301 mova m5, m1
2302
2303 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
2304 pmulhrsw m4, m7
2305 pmaddubsw m1, [r3 - 6 * 16] ; [10]
2306 pmulhrsw m1, m7
2307 packuswb m4, m1
2308
2309 pmaddubsw m5, [r3 + 15 * 16] ; [31]
2310 pmulhrsw m5, m7
2311
2312 palignr m6, m2, m0, 4
2313
2314 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
2315 pmulhrsw m6, m7
2316 packuswb m5, m6
2317
2318 palignr m1, m2, m0, 6
2319
2320 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
2321 pmulhrsw m6, m7
2322
2323 pmaddubsw m1, [r3 + 14 * 16] ; [30]
2324 pmulhrsw m1, m7
2325 packuswb m6, m1
2326
2327 palignr m1, m2, m0, 8
2328
2329 pmaddubsw m1, [r3 + 3 * 16] ; [19]
2330 pmulhrsw m1, m7
2331
2332 palignr m2, m0, 10
2333
; m2 is kept intact (result goes to m3) because the same source pairs are
; needed again for fraction 29 after the store; the macro does not touch m2.
2334 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
2335 pmulhrsw m3, m7
2336 packuswb m1, m3
2337
2338 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2339
2340 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
2341 pmulhrsw m4, m7
2342
2343 movu m0, [r2 + 6]
2344 palignr m1, m0, 1
2345
2346 punpckhbw m2, m0, m1
2347 punpcklbw m0, m1
2348 palignr m1, m2, m0, 2
2349
2350 pmaddubsw m1, [r3 + 2 * 16] ; [18]
2351 pmulhrsw m1, m7
2352 packuswb m4, m1
2353
2354 palignr m5, m2, m0, 4
2355 mova m6, m5
2356
2357 pmaddubsw m5, [r3 - 9 * 16] ; [07]
2358 pmulhrsw m5, m7
2359
2360 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2361 pmulhrsw m6, m7
2362 packuswb m5, m6
2363
2364 palignr m6, m2, m0, 6
2365
2366 pmaddubsw m6, [r3 + 16] ; [17]
2367 pmulhrsw m6, m7
2368
2369 palignr m1, m2, m0, 8
2370 palignr m2, m0, 10
2371
2372 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
2373 pmulhrsw m3, m7
2374 packuswb m6, m3
2375
2376 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2377 pmulhrsw m1, m7
2378
2379 pmaddubsw m2, [r3] ; [16]
2380 pmulhrsw m2, m7
2381 packuswb m1, m2
2382
2383 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2384
2385 lea r0, [r6 + r1 * 4] ; r0 -> 8 rows below its previous position
2386 lea r6, [r6 + r1 * 8] ; r6 -> again 4 rows below the new r0
2387 add r2, 8
2388 dec r4
2389 jnz .loop
2390
2391 RET
2392
; intra_pred_ang16_32: 16x16 angular prediction read from the 4th argument
; (refAbove); same fractions as intra_pred_ang16_4 but stored without a
; transpose.
; r0 = dst, r1 = dstStride; r2 is reloaded with the 4th argument.
; The loop runs twice (r4). Each pass writes an 8-column x 16-row strip via
; two store-only TRANSPOSE_STORE_8x8 invocations (each advances r0 by 8 rows).
; dst then moves 8 columns right (r6 holds the original dst) and the reference
; 8 bytes on.
; ang_table and pw_1024 are defined outside this chunk; pmulhrsw by pw_1024 is
; presumably a rounded >> 5 of the weighted sums.
2393INIT_XMM sse4
2394cglobal intra_pred_ang16_32, 3,7,8
2395 mov r2, r3mp
2396 lea r3, [ang_table + 16 * 16]
2397 mov r4d, 2
2398 lea r5, [r1 * 3] ; r5 -> 3 * stride
2399 mov r6, r0
2400 mova m7, [pw_1024]
2401
2402.loop:
2403 movu m0, [r2 + 1]
2404 palignr m1, m0, 1
2405
2406 punpckhbw m2, m0, m1
2407 punpcklbw m0, m1
2408 palignr m1, m2, m0, 2
2409 mova m5, m1
2410
2411
2412 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
2413 pmulhrsw m4, m7
2414 pmaddubsw m1, [r3 - 6 * 16] ; [10]
2415 pmulhrsw m1, m7
2416 packuswb m4, m1
2417
2418 pmaddubsw m5, [r3 + 15 * 16] ; [31]
2419 pmulhrsw m5, m7
2420
2421 palignr m6, m2, m0, 4
2422
2423 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
2424 pmulhrsw m6, m7
2425 packuswb m5, m6
2426
2427 palignr m1, m2, m0, 6
2428
2429 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
2430 pmulhrsw m6, m7
2431
2432 pmaddubsw m1, [r3 + 14 * 16] ; [30]
2433 pmulhrsw m1, m7
2434 packuswb m6, m1
2435
2436 palignr m1, m2, m0, 8
2437
2438 pmaddubsw m1, [r3 + 3 * 16] ; [19]
2439 pmulhrsw m1, m7
2440
2441 palignr m2, m0, 10
2442
; m2 is kept intact (result goes to m3) because the same source pairs are
; needed again for fraction 29 after the store.
2443 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
2444 pmulhrsw m3, m7
2445 packuswb m1, m3
2446
2447 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2448
2449 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
2450 pmulhrsw m4, m7
2451
2452 movu m0, [r2 + 6]
2453 palignr m1, m0, 1
2454
2455 punpckhbw m2, m0, m1
2456 punpcklbw m0, m1
2457 palignr m1, m2, m0, 2
2458
2459 pmaddubsw m1, [r3 + 2 * 16] ; [18]
2460 pmulhrsw m1, m7
2461 packuswb m4, m1
2462
2463 palignr m5, m2, m0, 4
2464 mova m6, m5
2465
2466 pmaddubsw m5, [r3 - 9 * 16] ; [07]
2467 pmulhrsw m5, m7
2468
2469 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2470 pmulhrsw m6, m7
2471 packuswb m5, m6
2472
2473 palignr m6, m2, m0, 6
2474
2475 pmaddubsw m6, [r3 + 16] ; [17]
2476 pmulhrsw m6, m7
2477
2478 palignr m1, m2, m0, 8
2479 palignr m2, m0, 10
2480
2481 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
2482 pmulhrsw m3, m7
2483 packuswb m6, m3
2484
2485 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2486 pmulhrsw m1, m7
2487
2488 pmaddubsw m2, [r3] ; [16]
2489 pmulhrsw m2, m7
2490 packuswb m1, m2
2491
2492 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
2493
2494 lea r0, [r6 + 8] ; back to row 0, 8 columns to the right
2495 add r2, 8
2496 dec r4
2497 jnz .loop
2498
2499 RET
2500
;-----------------------------------------------------------------------------
; intra_pred_ang16_5 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument).
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; moves the destination down by 8 * stride.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and the macro's
; second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
2501INIT_XMM sse4
2502cglobal intra_pred_ang16_5, 3,7,8
2503
2504 lea r3, [ang_table + 16 * 16]
2505 mov r4d, 2
2506 lea r5, [r1 * 3] ; r5 -> 3 * stride
2507 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2508 mova m7, [pw_1024]
2509
2510.loop:
2511 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2512 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2513 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2514 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2515
; each palignr below slides the pair window m2:m3 by one more pair (2 bytes)
2516 palignr m5, m2, m3, 2
2517
2518 pmaddubsw m4, m3, [r3 + 16] ; [17]
2519 pmulhrsw m4, m7
2520 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
2521 pmulhrsw m1, m7
2522 packuswb m4, m1
2523
2524 palignr m6, m2, m3, 4
2525
2526 pmaddubsw m5, [r3 + 3 * 16] ; [19]
2527 pmulhrsw m5, m7
2528 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
2529 pmulhrsw m1, m7
2530 packuswb m5, m1
2531
2532 palignr m1, m2, m3, 6
2533
2534 pmaddubsw m6, [r3 + 5 * 16] ; [21]
2535 pmulhrsw m6, m7
2536 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
2537 pmulhrsw m0, m7
2538 packuswb m6, m0
2539
2540 palignr m0, m2, m3, 8
2541
2542 pmaddubsw m1, [r3 + 7 * 16] ; [23]
2543 pmulhrsw m1, m7
2544 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2545 pmulhrsw m0, m7
2546 packuswb m1, m0
2547
2548 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2549
2550 palignr m4, m2, m3, 8
2551 palignr m5, m2, m3, 10
2552
2553 pmaddubsw m4, [r3 + 9 * 16] ; [25]
2554 pmulhrsw m4, m7
2555 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
2556 pmulhrsw m1, m7
2557 packuswb m4, m1
2558
2559 palignr m6, m2, m3, 12
2560
2561 pmaddubsw m5, [r3 + 11 * 16] ; [27]
2562 pmulhrsw m5, m7
2563 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
2564 pmulhrsw m1, m7
2565 packuswb m5, m1
2566
2567 palignr m1, m2, m3, 14
2568
2569 pmaddubsw m6, [r3 + 13 * 16] ; [29]
2570 pmulhrsw m6, m7
2571 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2572 pmulhrsw m0, m7
2573 packuswb m6, m0
2574
2575 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2576 pmulhrsw m1, m7
2577 pmaddubsw m2, [r3] ; [16]
2578 pmulhrsw m2, m7
2579 packuswb m1, m2
2580
2581 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2582
; next pass: r0 = old r6 + 4 * stride, r6 moves on by 8 * stride, reference
; advanced by 8 bytes
2583 lea r0, [r6 + r1 * 4]
2584 lea r6, [r6 + r1 * 8]
2585 add r2, 8
2586 dec r4
2587 jnz .loop
2588
2589 RET
2590
;-----------------------------------------------------------------------------
; intra_pred_ang16_31 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 is overwritten with the
; 4th argument (r3mp) and used as the reference byte array.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; restarts the destination at r6 + 8 (r6 keeps the original r0).
; The arithmetic matches intra_pred_ang16_5; only the reference source, the
; second TRANSPOSE_STORE_8x8 argument and the destination stepping differ.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5; confirm.
;-----------------------------------------------------------------------------
2591INIT_XMM sse4
2592cglobal intra_pred_ang16_31, 3,7,8
2593 mov r2, r3mp
2594 lea r3, [ang_table + 16 * 16]
2595 mov r4d, 2
2596 lea r5, [r1 * 3] ; r5 -> 3 * stride
2597 mov r6, r0
2598 mova m7, [pw_1024]
2599
2600.loop:
2601 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2602 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2603 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2604 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2605
; each palignr below slides the pair window m2:m3 by one more pair (2 bytes)
2606 palignr m5, m2, m3, 2
2607
2608 pmaddubsw m4, m3, [r3 + 16] ; [17]
2609 pmulhrsw m4, m7
2610 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
2611 pmulhrsw m1, m7
2612 packuswb m4, m1
2613
2614 palignr m6, m2, m3, 4
2615
2616 pmaddubsw m5, [r3 + 3 * 16] ; [19]
2617 pmulhrsw m5, m7
2618 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
2619 pmulhrsw m1, m7
2620 packuswb m5, m1
2621
2622 palignr m1, m2, m3, 6
2623
2624 pmaddubsw m6, [r3 + 5 * 16] ; [21]
2625 pmulhrsw m6, m7
2626 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
2627 pmulhrsw m0, m7
2628 packuswb m6, m0
2629
2630 palignr m0, m2, m3, 8
2631
2632 pmaddubsw m1, [r3 + 7 * 16] ; [23]
2633 pmulhrsw m1, m7
2634 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2635 pmulhrsw m0, m7
2636 packuswb m1, m0
2637
2638 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2639
2640 palignr m4, m2, m3, 8
2641 palignr m5, m2, m3, 10
2642
2643 pmaddubsw m4, [r3 + 9 * 16] ; [25]
2644 pmulhrsw m4, m7
2645 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
2646 pmulhrsw m1, m7
2647 packuswb m4, m1
2648
2649 palignr m6, m2, m3, 12
2650
2651 pmaddubsw m5, [r3 + 11 * 16] ; [27]
2652 pmulhrsw m5, m7
2653 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
2654 pmulhrsw m1, m7
2655 packuswb m5, m1
2656
2657 palignr m1, m2, m3, 14
2658
2659 pmaddubsw m6, [r3 + 13 * 16] ; [29]
2660 pmulhrsw m6, m7
2661 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2662 pmulhrsw m0, m7
2663 packuswb m6, m0
2664
2665 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2666 pmulhrsw m1, m7
2667 pmaddubsw m2, [r3] ; [16]
2668 pmulhrsw m2, m7
2669 packuswb m1, m2
2670
2671 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
2672
; next pass: destination 8 bytes to the right of the saved start, reference
; advanced by 8 bytes
2673 lea r0, [r6 + 8]
2674 add r2, 8
2675 dec r4
2676 jnz .loop
2677
2678 RET
2679
;-----------------------------------------------------------------------------
; intra_pred_ang16_6 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument).
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; moves the destination down by 8 * stride.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and the macro's
; second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
2680INIT_XMM sse4
2681cglobal intra_pred_ang16_6, 3,7,8
2682
2683 lea r3, [ang_table + 16 * 16]
2684 mov r4d, 2
2685 lea r5, [r1 * 3] ; r5 -> 3 * stride
2686 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2687 mova m7, [pw_1024]
2688
2689.loop:
2690 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2691 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2692 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2693 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2694
; the same pair window is reused for two table rows before it is slid on
2695 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
2696 pmulhrsw m4, m7
2697 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
2698 pmulhrsw m1, m7
2699 packuswb m4, m1
2700
2701 palignr m6, m2, m3, 2
2702
2703 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
2704 pmulhrsw m5, m7
2705 pmaddubsw m6, [r3 + 4 * 16] ; [20]
2706 pmulhrsw m6, m7
2707 packuswb m5, m6
2708
2709 palignr m1, m2, m3, 4
2710
2711 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
2712 pmulhrsw m6, m7
2713 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2714 pmulhrsw m0, m7
2715 packuswb m6, m0
2716
2717 palignr m0, m2, m3, 6
2718
2719 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2720 pmulhrsw m1, m7
2721 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2722 pmulhrsw m0, m7
2723 packuswb m1, m0
2724
2725 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2726
2727 palignr m4, m2, m3, 6
2728 palignr m6, m2, m3, 8
2729
2730 pmaddubsw m4, [r3 + 5 * 16] ; [21]
2731 pmulhrsw m4, m7
2732 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
2733 pmulhrsw m1, m7
2734 packuswb m4, m1
2735
2736 pmaddubsw m5, m6, [r3 - 16] ; [15]
2737 pmulhrsw m5, m7
2738 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2739 pmulhrsw m6, m7
2740 packuswb m5, m6
2741
2742 palignr m0, m2, m3, 10
2743
2744 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
2745 pmulhrsw m6, m7
2746 pmaddubsw m0, [r3 + 6 * 16] ; [22]
2747 pmulhrsw m0, m7
2748 packuswb m6, m0
2749
2750 palignr m2, m3, 12
2751
2752 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
2753 pmulhrsw m1, m7
2754 pmaddubsw m2, [r3] ; [16]
2755 pmulhrsw m2, m7
2756 packuswb m1, m2
2757
2758 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2759
; next pass: r0 = old r6 + 4 * stride, r6 moves on by 8 * stride, reference
; advanced by 8 bytes
2760 lea r0, [r6 + r1 * 4]
2761 lea r6, [r6 + r1 * 8]
2762 add r2, 8
2763 dec r4
2764 jnz .loop
2765
2766 RET
2767
;-----------------------------------------------------------------------------
; intra_pred_ang16_30 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 is overwritten with the
; 4th argument (r3mp) and used as the reference byte array.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; restarts the destination at r6 + 8 (r6 keeps the original r0).
; The arithmetic matches intra_pred_ang16_6; only the reference source, the
; second TRANSPOSE_STORE_8x8 argument and the destination stepping differ.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5; confirm.
;-----------------------------------------------------------------------------
2768INIT_XMM sse4
2769cglobal intra_pred_ang16_30, 3,7,8
2770 mov r2, r3mp
2771 lea r3, [ang_table + 16 * 16]
2772 mov r4d, 2
2773 lea r5, [r1 * 3] ; r5 -> 3 * stride
2774 mov r6, r0
2775 mova m7, [pw_1024]
2776
2777.loop:
2778 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2779 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2780 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2781 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2782
; the same pair window is reused for two table rows before it is slid on
2783 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
2784 pmulhrsw m4, m7
2785 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
2786 pmulhrsw m1, m7
2787 packuswb m4, m1
2788
2789 palignr m6, m2, m3, 2
2790
2791 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
2792 pmulhrsw m5, m7
2793 pmaddubsw m6, [r3 + 4 * 16] ; [20]
2794 pmulhrsw m6, m7
2795 packuswb m5, m6
2796
2797 palignr m1, m2, m3, 4
2798
2799 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
2800 pmulhrsw m6, m7
2801 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2802 pmulhrsw m0, m7
2803 packuswb m6, m0
2804
2805 palignr m0, m2, m3, 6
2806
2807 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2808 pmulhrsw m1, m7
2809 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2810 pmulhrsw m0, m7
2811 packuswb m1, m0
2812
2813 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2814
2815 palignr m4, m2, m3, 6
2816 palignr m6, m2, m3, 8
2817
2818 pmaddubsw m4, [r3 + 5 * 16] ; [21]
2819 pmulhrsw m4, m7
2820 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
2821 pmulhrsw m1, m7
2822 packuswb m4, m1
2823
2824 pmaddubsw m5, m6, [r3 - 16] ; [15]
2825 pmulhrsw m5, m7
2826 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2827 pmulhrsw m6, m7
2828 packuswb m5, m6
2829
2830 palignr m0, m2, m3, 10
2831
2832 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
2833 pmulhrsw m6, m7
2834 pmaddubsw m0, [r3 + 6 * 16] ; [22]
2835 pmulhrsw m0, m7
2836 packuswb m6, m0
2837
2838 palignr m2, m3, 12
2839
2840 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
2841 pmulhrsw m1, m7
2842 pmaddubsw m2, [r3] ; [16]
2843 pmulhrsw m2, m7
2844 packuswb m1, m2
2845
2846 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
2847
; next pass: destination 8 bytes to the right of the saved start, reference
; advanced by 8 bytes
2848 lea r0, [r6 + 8]
2849 add r2, 8
2850 dec r4
2851 jnz .loop
2852
2853 RET
2854
;-----------------------------------------------------------------------------
; intra_pred_ang16_7 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument).
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; moves the destination down by 8 * stride.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and the macro's
; second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
2855INIT_XMM sse4
2856cglobal intra_pred_ang16_7, 3,7,8
2857
2858 lea r3, [ang_table + 16 * 16]
2859 mov r4d, 2
2860 lea r5, [r1 * 3] ; r5 -> 3 * stride
2861 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2862 mova m7, [pw_1024]
2863
2864.loop:
2865 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2866 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2867 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2868 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2869
; the unshifted pair window m3 feeds three table rows (9, 18, 27)
2870 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
2871 pmulhrsw m4, m7
2872 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
2873 pmulhrsw m0, m7
2874 packuswb m4, m0
2875
2876 palignr m1, m2, m3, 2
2877
2878 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
2879 pmulhrsw m5, m7
2880 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
2881 pmulhrsw m6, m7
2882 packuswb m5, m6
2883
2884 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
2885 pmulhrsw m6, m7
2886 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
2887 pmulhrsw m0, m7
2888 packuswb m6, m0
2889
2890 palignr m0, m2, m3, 4
2891
2892 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2893 pmulhrsw m1, m7
2894 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2895 pmulhrsw m0, m7
2896 packuswb m1, m0
2897
2898 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2899
2900 palignr m1, m2, m3, 4
2901
2902 pmaddubsw m4, m1, [r3 + 16] ; [17]
2903 pmulhrsw m4, m7
2904 pmaddubsw m1, [r3 + 10 * 16] ; [26]
2905 pmulhrsw m1, m7
2906 packuswb m4, m1
2907
2908 palignr m0, m2, m3, 6
2909
2910 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
2911 pmulhrsw m5, m7
2912 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
2913 pmulhrsw m6, m7
2914 packuswb m5, m6
2915
2916 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
2917 pmulhrsw m6, m7
2918 pmaddubsw m0, [r3 + 14 * 16] ; [30]
2919 pmulhrsw m0, m7
2920 packuswb m6, m0
2921
2922 palignr m2, m3, 8
2923
2924 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
2925 pmulhrsw m1, m7
2926 pmaddubsw m2, [r3] ; [16]
2927 pmulhrsw m2, m7
2928 packuswb m1, m2
2929
2930 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2931
; next pass: r0 = old r6 + 4 * stride, r6 moves on by 8 * stride, reference
; advanced by 8 bytes
2932 lea r0, [r6 + r1 * 4]
2933 lea r6, [r6 + r1 * 8]
2934 add r2, 8
2935 dec r4
2936 jnz .loop
2937
2938 RET
2939
;-----------------------------------------------------------------------------
; intra_pred_ang16_29 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 is overwritten with the
; 4th argument (r3mp) and used as the reference byte array.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; restarts the destination at r6 + 8 (r6 keeps the original r0).
; The arithmetic matches intra_pred_ang16_7; only the reference source, the
; second TRANSPOSE_STORE_8x8 argument and the destination stepping differ.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5; confirm.
;-----------------------------------------------------------------------------
2940INIT_XMM sse4
2941cglobal intra_pred_ang16_29, 3,7,8
2942 mov r2, r3mp
2943 lea r3, [ang_table + 16 * 16]
2944 mov r4d, 2
2945 lea r5, [r1 * 3] ; r5 -> 3 * stride
2946 mov r6, r0
2947 mova m7, [pw_1024]
2948
2949.loop:
2950 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2951 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2952 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2953 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2954
; the unshifted pair window m3 feeds three table rows (9, 18, 27)
2955 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
2956 pmulhrsw m4, m7
2957 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
2958 pmulhrsw m0, m7
2959 packuswb m4, m0
2960
2961 palignr m1, m2, m3, 2
2962
2963 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
2964 pmulhrsw m5, m7
2965 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
2966 pmulhrsw m6, m7
2967 packuswb m5, m6
2968
2969 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
2970 pmulhrsw m6, m7
2971 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
2972 pmulhrsw m0, m7
2973 packuswb m6, m0
2974
2975 palignr m0, m2, m3, 4
2976
2977 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2978 pmulhrsw m1, m7
2979 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2980 pmulhrsw m0, m7
2981 packuswb m1, m0
2982
2983 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2984
2985 palignr m1, m2, m3, 4
2986
2987 pmaddubsw m4, m1, [r3 + 16] ; [17]
2988 pmulhrsw m4, m7
2989 pmaddubsw m1, [r3 + 10 * 16] ; [26]
2990 pmulhrsw m1, m7
2991 packuswb m4, m1
2992
2993 palignr m0, m2, m3, 6
2994
2995 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
2996 pmulhrsw m5, m7
2997 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
2998 pmulhrsw m6, m7
2999 packuswb m5, m6
3000
3001 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
3002 pmulhrsw m6, m7
3003 pmaddubsw m0, [r3 + 14 * 16] ; [30]
3004 pmulhrsw m0, m7
3005 packuswb m6, m0
3006
3007 palignr m2, m3, 8
3008
3009 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
3010 pmulhrsw m1, m7
3011 pmaddubsw m2, [r3] ; [16]
3012 pmulhrsw m2, m7
3013 packuswb m1, m2
3014
3015 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3016
; next pass: destination 8 bytes to the right of the saved start, reference
; advanced by 8 bytes
3017 lea r0, [r6 + 8]
3018 add r2, 8
3019 dec r4
3020 jnz .loop
3021
3022 RET
3023
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument).
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; moves the destination down by 8 * stride.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and the macro's
; second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
3024INIT_XMM sse4
3025cglobal intra_pred_ang16_8, 3,7,8
3026
3027 lea r3, [ang_table + 16 * 16]
3028 mov r4d, 2
3029 lea r5, [r1 * 3] ; r5 -> 3 * stride
3030 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3031 mova m7, [pw_1024]
3032
3033.loop:
3034 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3035 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3036 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3037 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3038
; the unshifted pair window m1 feeds six table rows (5 .. 30 in steps of 5)
3039 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
3040 pmulhrsw m4, m7
3041 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
3042 pmulhrsw m2, m7
3043 packuswb m4, m2
3044
3045 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
3046 pmulhrsw m5, m7
3047 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
3048 pmulhrsw m6, m7
3049 packuswb m5, m6
3050
3051 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
3052 pmulhrsw m6, m7
3053 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
3054 pmulhrsw m2, m7
3055 packuswb m6, m2
3056
; m2 / m3 = pair window slid by one / two pairs; m0 and m1 are free afterwards
3057 palignr m2, m0, m1, 2
3058 palignr m3, m0, m1, 4
3059
3060 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
3061 pmulhrsw m1, m7
3062 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
3063 pmulhrsw m0, m7
3064 packuswb m1, m0
3065
3066 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3067
3068 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
3069 pmulhrsw m4, m7
3070 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
3071 pmulhrsw m5, m7
3072 packuswb m4, m5
3073
3074 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
3075 pmulhrsw m5, m7
3076 pmaddubsw m2, [r3 + 12 * 16] ; [28]
3077 pmulhrsw m2, m7
3078 packuswb m5, m2
3079
3080 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
3081 pmulhrsw m6, m7
3082 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
3083 pmulhrsw m1, m7
3084 packuswb m6, m1
3085
3086 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
3087 pmulhrsw m1, m7
3088 pmaddubsw m3, [r3] ; [16]
3089 pmulhrsw m3, m7
3090 packuswb m1, m3
3091
3092 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3093
; next pass: r0 = old r6 + 4 * stride, r6 moves on by 8 * stride, reference
; advanced by 8 bytes
3094 lea r0, [r6 + r1 * 4]
3095 lea r6, [r6 + r1 * 8]
3096 add r2, 8
3097 dec r4
3098 jnz .loop
3099
3100 RET
3101
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 is overwritten with the
; 4th argument (r3mp) and used as the reference byte array.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; restarts the destination at r6 + 8 (r6 keeps the original r0).
; The arithmetic matches intra_pred_ang16_8; only the reference source, the
; second TRANSPOSE_STORE_8x8 argument and the destination stepping differ.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5; confirm.
;-----------------------------------------------------------------------------
3102INIT_XMM sse4
3103cglobal intra_pred_ang16_28, 3,7,8
3104 mov r2, r3mp
3105 lea r3, [ang_table + 16 * 16]
3106 mov r4d, 2
3107 lea r5, [r1 * 3] ; r5 -> 3 * stride
3108 mov r6, r0
3109 mova m7, [pw_1024]
3110
3111.loop:
3112 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3113 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3114 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3115 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3116
; the unshifted pair window m1 feeds six table rows (5 .. 30 in steps of 5)
3117 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
3118 pmulhrsw m4, m7
3119 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
3120 pmulhrsw m2, m7
3121 packuswb m4, m2
3122
3123 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
3124 pmulhrsw m5, m7
3125 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
3126 pmulhrsw m6, m7
3127 packuswb m5, m6
3128
3129 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
3130 pmulhrsw m6, m7
3131 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
3132 pmulhrsw m2, m7
3133 packuswb m6, m2
3134
; m2 / m3 = pair window slid by one / two pairs; m0 and m1 are free afterwards
3135 palignr m2, m0, m1, 2
3136 palignr m3, m0, m1, 4
3137
3138 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
3139 pmulhrsw m1, m7
3140 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
3141 pmulhrsw m0, m7
3142 packuswb m1, m0
3143
3144 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3145
3146 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
3147 pmulhrsw m4, m7
3148 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
3149 pmulhrsw m5, m7
3150 packuswb m4, m5
3151
3152 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
3153 pmulhrsw m5, m7
3154 pmaddubsw m2, [r3 + 12 * 16] ; [28]
3155 pmulhrsw m2, m7
3156 packuswb m5, m2
3157
3158 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
3159 pmulhrsw m6, m7
3160 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
3161 pmulhrsw m1, m7
3162 packuswb m6, m1
3163
3164 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
3165 pmulhrsw m1, m7
3166 pmaddubsw m3, [r3] ; [16]
3167 pmulhrsw m3, m7
3168 packuswb m1, m3
3169
3170 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3171
; next pass: destination 8 bytes to the right of the saved start, reference
; advanced by 8 bytes
3172 lea r0, [r6 + 8]
3173 add r2, 8
3174 dec r4
3175 jnz .loop
3176
3177 RET
3178
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument).
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; Every weighted result uses the same unshifted pair window (m2) with table
; rows 2, 4, ... 30; the 16th result is the unweighted reference shifted by
; one byte (m3), marked [00].
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; moves the destination down by 8 * stride.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and the macro's
; second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
3179INIT_XMM sse4
3180cglobal intra_pred_ang16_9, 3,7,8
3181
3182 lea r3, [ang_table + 16 * 16]
3183 mov r4d, 2
3184 lea r5, [r1 * 3] ; r5 -> 3 * stride
3185 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3186 mova m7, [pw_1024]
3187
3188.loop:
3189 movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3190 palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3191 punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3192
3193 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
3194 pmulhrsw m4, m7
3195 pmaddubsw m0, m2, [r3 - 12 * 16] ; [4]
3196 pmulhrsw m0, m7
3197 packuswb m4, m0
3198
3199 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
3200 pmulhrsw m5, m7
3201 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
3202 pmulhrsw m6, m7
3203 packuswb m5, m6
3204
3205 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
3206 pmulhrsw m6, m7
3207 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
3208 pmulhrsw m0, m7
3209 packuswb m6, m0
3210
3211 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
3212 pmulhrsw m1, m7
3213 pmaddubsw m0, m2, [r3] ; [16]
3214 pmulhrsw m0, m7
3215 packuswb m1, m0
3216
3217 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3218
3219 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
3220 pmulhrsw m4, m7
3221 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
3222 pmulhrsw m5, m7
3223 packuswb m4, m5
3224
3225 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
3226 pmulhrsw m5, m7
3227 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
3228 pmulhrsw m6, m7
3229 packuswb m5, m6
3230
3231 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
3232 pmulhrsw m6, m7
3233 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
3234 pmulhrsw m1, m7
3235 packuswb m6, m1
3236
3237 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
3238 pmulhrsw m1, m7
3239 packuswb m1, m1
3240
; high half of m1 = low 8 bytes of m3, i.e. the reference bytes copied as-is
3241 punpcklqdq m1, m3 ; [00]
3242
3243 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3244
; next pass: r0 = old r6 + 4 * stride, r6 moves on by 8 * stride, reference
; advanced by 8 bytes
3245 lea r0, [r6 + r1 * 4]
3246 lea r6, [r6 + r1 * 8]
3247 add r2, 8
3248 dec r4
3249 jnz .loop
3250
3251 RET
3252
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 is overwritten with the
; 4th argument (r3mp) and used as the reference byte array.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; Every weighted result uses the same unshifted pair window (m3) with table
; rows 2, 4, ... 30; the 16th row is the unweighted reference shifted by one
; byte (m2).
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; restarts the destination at r6 + 8 (r6 keeps the original r0).
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5; confirm.
;-----------------------------------------------------------------------------
3253INIT_XMM sse4
3254cglobal intra_pred_ang16_27, 3,7,8
3255 mov r2, r3mp
3256 lea r3, [ang_table + 16 * 16]
3257 mov r4d, 2
3258 lea r5, [r1 * 3] ; r5 -> 3 * stride
3259 mov r6, r0
3260 mova m7, [pw_1024]
3261
3262.loop:
3263 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3264 palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3265 punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3266
3267 pmaddubsw m4, m3, [r3 - 14 * 16] ; [2]
3268 pmulhrsw m4, m7
3269 pmaddubsw m0, m3, [r3 - 12 * 16] ; [4]
3270 pmulhrsw m0, m7
3271 packuswb m4, m0
3272
3273 pmaddubsw m5, m3, [r3 - 10 * 16] ; [6]
3274 pmulhrsw m5, m7
3275 pmaddubsw m6, m3, [r3 - 8 * 16] ; [8]
3276 pmulhrsw m6, m7
3277 packuswb m5, m6
3278
3279 pmaddubsw m6, m3, [r3 - 6 * 16] ; [10]
3280 pmulhrsw m6, m7
3281 pmaddubsw m0, m3, [r3 - 4 * 16] ; [12]
3282 pmulhrsw m0, m7
3283 packuswb m6, m0
3284
3285 pmaddubsw m1, m3, [r3 - 2 * 16] ; [14]
3286 pmulhrsw m1, m7
3287 pmaddubsw m0, m3, [r3] ; [16]
3288 pmulhrsw m0, m7
3289 packuswb m1, m0
3290
3291 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3292
3293 pmaddubsw m4, m3, [r3 + 2 * 16] ; [18]
3294 pmulhrsw m4, m7
3295 pmaddubsw m5, m3, [r3 + 4 * 16] ; [20]
3296 pmulhrsw m5, m7
3297 packuswb m4, m5
3298
3299 pmaddubsw m5, m3, [r3 + 6 * 16] ; [22]
3300 pmulhrsw m5, m7
3301 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3302 pmulhrsw m6, m7
3303 packuswb m5, m6
3304
3305 pmaddubsw m6, m3, [r3 + 10 * 16] ; [26]
3306 pmulhrsw m6, m7
3307 pmaddubsw m1, m3, [r3 + 12 * 16] ; [28]
3308 pmulhrsw m1, m7
3309 packuswb m6, m1
3310
3311 pmaddubsw m1, m3, [r3 + 14 * 16] ; [30]
3312 pmulhrsw m1, m7
3313 packuswb m1, m1
3314
; second group of 8 rows is written directly, 8 bytes per row, instead of
; through TRANSPOSE_STORE_8x8; the last row is the unweighted bytes in m2
3315 movh [r0 ], m4
3316 movhps [r0 + r1 ], m4
3317 movh [r0 + r1 * 2], m5
3318 movhps [r0 + r5 ], m5
3319 lea r0, [r0 + r1 * 4]
3320 movh [r0 ], m6
3321 movhps [r0 + r1 ], m6
3322 movh [r0 + r1 * 2], m1
3323 movh [r0 + r5 ], m2
3324
; next pass: destination 8 bytes to the right of the saved start, reference
; advanced by 8 bytes
3325 lea r0, [r6 + 8]
3326 add r2, 8
3327 dec r4
3328 jnz .loop
3329
3330 RET
3331
;-----------------------------------------------------------------------------
; intra_pred_ang16_10 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument), r3 = second reference array (4th argument, only read by
; the filter), r5 = filter flag (6th argument).
; Row k of the 16x16 block is byte k of [r2 + 1 ..] broadcast across 16 bytes
; (pshufb with the all-zero mask m7 replicates byte 0).
; Rows 1..15 are stored straight away; row 0 is stored last at .quit so the
; optional filter can replace it first:
;   row0[i] = sat_u8(ref[1] + ((r3[i + 1] - r3[0]) >> 1)),  i = 0..15
; The flag is tested as a 32-bit value, matching the dword test in the
; x86-32 path of intra_pred_ang16_26; the previous 16-bit compare ignored
; the upper half of the flag.
;-----------------------------------------------------------------------------
3332INIT_XMM sse4
3333cglobal intra_pred_ang16_10, 6,6,8
3334 lea r4, [r1 * 3]
3335 pxor m7, m7
3336
; two-operand palignr: only byte 0 of each result matters because pshufb
; then broadcasts it
3337 movu m0, [r2 + 1]
3338 palignr m1, m0, 1
3339 pshufb m1, m7
3340 palignr m2, m0, 2
3341 pshufb m2, m7
3342 palignr m3, m0, 3
3343 pshufb m3, m7
3344 palignr m4, m0, 4
3345 pshufb m4, m7
3346 palignr m5, m0, 5
3347 pshufb m5, m7
3348 palignr m6, m0, 6
3349 pshufb m6, m7
3350
; rows 1..6 (r2 is free from here on and becomes a row pointer)
3351 movu [r0 + r1], m1
3352 movu [r0 + r1 * 2], m2
3353 movu [r0 + r4], m3
3354 lea r2, [r0 + r1 * 4]
3355 movu [r2], m4
3356 movu [r2 + r1], m5
3357 movu [r2 + r1 * 2], m6
3358
3359 palignr m1, m0, 7
3360 pshufb m1, m7
; movhlps puts byte 8 of m0 into byte 0 of m2
3361 movhlps m2, m0
3362 pshufb m2, m7
3363 palignr m3, m0, 9
3364 pshufb m3, m7
3365 palignr m4, m0, 10
3366 pshufb m4, m7
3367 palignr m5, m0, 11
3368 pshufb m5, m7
3369 palignr m6, m0, 12
3370 pshufb m6, m7
3371
; rows 7..12
3372 movu [r2 + r4], m1
3373 lea r2, [r2 + r1 * 4]
3374 movu [r2], m2
3375 movu [r2 + r1], m3
3376 movu [r2 + r1 * 2], m4
3377 movu [r2 + r4], m5
3378 lea r2, [r2 + r1 * 4]
3379 movu [r2], m6
3380
3381 palignr m1, m0, 13
3382 pshufb m1, m7
3383 palignr m2, m0, 14
3384 pshufb m2, m7
3385 palignr m3, m0, 15
3386 pshufb m3, m7
; m0 becomes row 0 (byte 0 broadcast); it is stored at .quit
3387 pshufb m0, m7
3388
; rows 13..15
3389 movu [r2 + r1], m1
3390 movu [r2 + r1 * 2], m2
3391 movu [r2 + r4], m3
3392
3393; filter
3394 cmp r5d, byte 0
3395 jz .quit
; widen row 0 to words (m0 and m1 both hold the same 8 words)
3396 pmovzxbw m0, m0
3397 mova m1, m0
3398 movu m2, [r3]
3399 movu m3, [r3 + 1]
3400
; m2 = r3[0] in every word; m3 / m4 = r3[1..8] / r3[9..16] as words
3401 pshufb m2, m7
3402 pmovzxbw m2, m2
3403 movhlps m4, m3
3404 pmovzxbw m3, m3
3405 pmovzxbw m4, m4
3406 psubw m3, m2
3407 psubw m4, m2
3408 psraw m3, 1
3409 psraw m4, 1
3410 paddw m0, m3
3411 paddw m1, m4
3412 packuswb m0, m1
3413
3414.quit:
3415 movu [r0], m0
3416
3417 RET
3418
;-----------------------------------------------------------------------------
; intra_pred_ang16_26 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes used
; by the filter (3rd argument), r3 = reference bytes copied into every row
; (4th argument), 6th argument = filter flag.
; All 16 rows are filled with the 16 bytes at [r3 + 1]. When the flag is
; non-zero, byte 0 of every row i is then replaced by
;   sat_u8(row[0] + ((r2[i + 1] - r2[0]) >> 1)),  i = 0..15
; r3 and r5 are reused as row pointers, so the flag is first parked in r7
; (x86-64) or in a stack slot (x86-32). Both paths test it as a 32-bit
; value; the x86-64 path previously looked at the low 16 bits only.
;-----------------------------------------------------------------------------
3419INIT_XMM sse4
3420%if ARCH_X86_64 == 1
3421cglobal intra_pred_ang16_26, 4,8,5
3422 mov r7, r5mp
3423 %define bfilter r7d
3424%else
3425 cglobal intra_pred_ang16_26, 6,7,5,0 - 4
3426 %define bfilter dword[rsp]
3427 mov bfilter, r5
3428%endif
3429 movu m0, [r3 + 1]
3430
; r0 / r3 / r5 / r6 = rows 0, 4, 8, 12; r4 = 3 * stride
3431 lea r4, [r1 * 3]
3432 lea r3, [r0 + r1 * 4]
3433 lea r5, [r3 + r1 * 4]
3434 lea r6, [r5 + r1 * 4]
3435
3436 movu [r0], m0
3437 movu [r0 + r1], m0
3438 movu [r0 + r1 * 2], m0
3439 movu [r0 + r4], m0
3440 movu [r3], m0
3441 movu [r3 + r1], m0
3442 movu [r3 + r1 * 2], m0
3443 movu [r3 + r4], m0
3444 movu [r5], m0
3445 movu [r5 + r1], m0
3446 movu [r5 + r1 * 2], m0
3447 movu [r5 + r4], m0
3448
3449 movu [r6], m0
3450 movu [r6 + r1], m0
3451 movu [r6 + r1 * 2], m0
3452 movu [r6 + r4], m0
3453
3454; filter
3455 cmp bfilter, byte 0
3456 jz .quit
3457
; m0 / m1 = byte 0 of the row pattern broadcast and widened to words
3458 pxor m4, m4
3459 pshufb m0, m4
3460 pmovzxbw m0, m0
3461 mova m1, m0
3462 movu m2, [r2]
3463 movu m3, [r2 + 1]
3464
; m2 = r2[0] in every word; m3 / m4 = r2[1..8] / r2[9..16] as words
3465 pshufb m2, m4
3466 pmovzxbw m2, m2
3467 movhlps m4, m3
3468 pmovzxbw m3, m3
3469 pmovzxbw m4, m4
3470 psubw m3, m2
3471 psubw m4, m2
3472 psraw m3, 1
3473 psraw m4, 1
3474 paddw m0, m3
3475 paddw m1, m4
3476 packuswb m0, m1
3477
; byte i of m0 overwrites the first byte of row i
3478 pextrb [r0], m0, 0
3479 pextrb [r0 + r1], m0, 1
3480 pextrb [r0 + r1 * 2], m0, 2
3481 pextrb [r0 + r4], m0, 3
3482 pextrb [r3], m0, 4
3483 pextrb [r3 + r1], m0, 5
3484 pextrb [r3 + r1 * 2], m0, 6
3485 pextrb [r3 + r4], m0, 7
3486 pextrb [r5], m0, 8
3487 pextrb [r5 + r1], m0, 9
3488 pextrb [r5 + r1 * 2], m0, 10
3489 pextrb [r5 + r4], m0, 11
3490 pextrb [r6], m0, 12
3491 pextrb [r6 + r1], m0, 13
3492 pextrb [r6 + r1 * 2], m0, 14
3493 pextrb [r6 + r4], m0, 15
3494
3495.quit:
3496 RET
3497
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = reference bytes
; (3rd argument), read from offset 0.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; Every weighted result uses the same unshifted pair window (m3) with table
; rows 30, 28, ... 2; the 16th result is the unweighted reference bytes kept
; in m2, marked [00].
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; moves the destination down by 8 * stride.
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and the macro's
; second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
3498INIT_XMM sse4
3499cglobal intra_pred_ang16_11, 3,7,8
3500
3501 lea r3, [ang_table + 16 * 16]
3502 mov r4d, 2
3503 lea r5, [r1 * 3] ; r5 -> 3 * stride
3504 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3505 mova m7, [pw_1024]
3506
3507.loop:
3508 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3509 mova m2, m3
3510 palignr m1, m3, 1 ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3511 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3512
3513 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
3514 pmulhrsw m4, m7
3515 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
3516 pmulhrsw m0, m7
3517 packuswb m4, m0
3518
3519 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
3520 pmulhrsw m5, m7
3521 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3522 pmulhrsw m6, m7
3523 packuswb m5, m6
3524
3525 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
3526 pmulhrsw m6, m7
3527 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
3528 pmulhrsw m0, m7
3529 packuswb m6, m0
3530
3531 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
3532 pmulhrsw m1, m7
3533 pmaddubsw m0, m3, [r3] ; [16]
3534 pmulhrsw m0, m7
3535 packuswb m1, m0
3536
3537 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3538
3539 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
3540 pmulhrsw m4, m7
3541 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
3542 pmulhrsw m5, m7
3543 packuswb m4, m5
3544
3545 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
3546 pmulhrsw m5, m7
3547 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
3548 pmulhrsw m6, m7
3549 packuswb m5, m6
3550
3551 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
3552 pmulhrsw m6, m7
3553 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
3554 pmulhrsw m1, m7
3555 packuswb m6, m1
3556
3557 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
3558 pmulhrsw m1, m7
3559 packuswb m1, m1
; high half of m1 = low 8 bytes of m2, i.e. the reference bytes copied as-is
3560 punpcklqdq m1, m2 ;[00]
3561
3562 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3563
; next pass: r0 = old r6 + 4 * stride, r6 moves on by 8 * stride, reference
; advanced by 8 bytes
3564 lea r0, [r6 + r1 * 4]
3565 lea r6, [r6 + r1 * 8]
3566 add r2, 8
3567 dec r4
3568 jnz .loop
3569
3570 RET
3571
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 is overwritten with the
; 4th argument (r3mp) and used as the reference byte array, read from
; offset 0.
; r3 = ang_table row 16, so [r3 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; Every weighted result uses the same unshifted pair window (m3) with table
; rows 30, 28, ... 2; the 16th row is the unweighted reference bytes kept
; in m2.
; .loop runs twice (r4d = 2); each pass consumes 8 more reference bytes and
; restarts the destination at r6 + 8 (r6 keeps the original r0).
; NOTE(review): pw_1024 and TRANSPOSE_STORE_8x8 are defined outside this
; excerpt - presumably pmulhrsw by 1024 is a rounded >> 5; confirm.
;-----------------------------------------------------------------------------
3572INIT_XMM sse4
3573cglobal intra_pred_ang16_25, 3,7,8
3574 mov r2, r3mp
3575 lea r3, [ang_table + 16 * 16]
3576 mov r4d, 2
3577 lea r5, [r1 * 3] ; r5 -> 3 * stride
3578 mov r6, r0
3579 mova m7, [pw_1024]
3580
3581.loop:
3582 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3583 mova m2, m3
3584 palignr m1, m3, 1 ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3585 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3586
3587 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
3588 pmulhrsw m4, m7
3589 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
3590 pmulhrsw m0, m7
3591 packuswb m4, m0
3592
3593 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
3594 pmulhrsw m5, m7
3595 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3596 pmulhrsw m6, m7
3597 packuswb m5, m6
3598
3599 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
3600 pmulhrsw m6, m7
3601 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
3602 pmulhrsw m0, m7
3603 packuswb m6, m0
3604
3605 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
3606 pmulhrsw m1, m7
3607 pmaddubsw m0, m3, [r3] ; [16]
3608 pmulhrsw m0, m7
3609 packuswb m1, m0
3610
3611 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3612
3613 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
3614 pmulhrsw m4, m7
3615 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
3616 pmulhrsw m5, m7
3617 packuswb m4, m5
3618
3619 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
3620 pmulhrsw m5, m7
3621 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
3622 pmulhrsw m6, m7
3623 packuswb m5, m6
3624
3625 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
3626 pmulhrsw m6, m7
3627 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
3628 pmulhrsw m1, m7
3629 packuswb m6, m1
3630
3631 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
3632 pmulhrsw m1, m7
3633 packuswb m1, m1
3634
; second group of 8 rows is written directly, 8 bytes per row, instead of
; through TRANSPOSE_STORE_8x8; the last row is the unweighted bytes in m2
3635 movh [r0 ], m4
3636 movhps [r0 + r1 ], m4
3637 movh [r0 + r1 * 2], m5
3638 movhps [r0 + r5 ], m5
3639 lea r0, [r0 + r1 * 4]
3640 movh [r0 ], m6
3641 movhps [r0 + r1 ], m6
3642 movh [r0 + r1 * 2], m1
3643 movh [r0 + r5 ], m2
3644
; next pass: destination 8 bytes to the right of the saved start, reference
; advanced by 8 bytes
3645 lea r0, [r6 + 8]
3646 add r2, 8
3647 dec r4
3648 jnz .loop
3649
3650 RET
3651
;-----------------------------------------------------------------------------
; intra_pred_ang16_12 (SSE4)
; Registers: r0 = destination pointer, r1 = stride, r2 = main reference bytes
; (3rd argument), r3 = second reference array (4th argument).
; r4 = ang_table row 16, so [r4 + k * 16] is row 16 + k; the bracketed number
; in each line comment is that row index.
; There is no loop: the two 8-row halves are written out one after the other.
; m2 holds bytes of [r3] reordered by the c_mode16_12 shuffle mask; palignr
; against m2 brings those bytes in at the low end of the pair window when
; the window has to step below the start of the main reference.
; NOTE(review): pw_1024, c_mode16_12 and TRANSPOSE_STORE_8x8 are defined
; outside this excerpt - presumably pmulhrsw by 1024 is a rounded >> 5 and
; the macro's second argument selects a transposed store; confirm.
;-----------------------------------------------------------------------------
3652INIT_XMM sse4
3653cglobal intra_pred_ang16_12, 4,7,8
3654
3655 lea r4, [ang_table + 16 * 16]
3656 lea r5, [r1 * 3] ; r5 -> 3 * stride
3657 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3658 mova m7, [pw_1024]
3659
3660 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3661 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3662 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
3663 movu m2, [r3]
3664 pshufb m2, [c_mode16_12]
3665
3666 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3667
3668 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
3669 pmulhrsw m4, m7
3670 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
3671 pmulhrsw m1, m7
3672 packuswb m4, m1
3673
3674 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
3675 pmulhrsw m5, m7
3676 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
3677 pmulhrsw m6, m7
3678 packuswb m5, m6
3679
3680 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
3681 pmulhrsw m6, m7
3682 pmaddubsw m0, [r4 - 14 * 16] ; [2]
3683 pmulhrsw m0, m7
3684 packuswb m6, m0
3685
; step the window one byte lower: the new lowest byte is byte 15 of m2
3686 palignr m3, m2, 15
3687
3688 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3689 pmulhrsw m1, m7
3690 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3691 pmulhrsw m0, m7
3692 packuswb m1, m0
3693
3694 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3695
3696 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3697 pmulhrsw m4, m7
3698 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3699 pmulhrsw m5, m7
3700 packuswb m4, m5
3701
3702 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3703 pmulhrsw m5, m7
3704 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3705 pmulhrsw m6, m7
3706 packuswb m5, m6
3707
; step the window two more bytes: bytes 14..15 of m2 enter at the low end
3708 palignr m3, m2, 14
3709
3710 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3711 pmulhrsw m6, m7
3712 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3713 pmulhrsw m1, m7
3714 packuswb m6, m1
3715
3716 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3717 pmulhrsw m1, m7
3718 pmaddubsw m3, [r4] ; [16]
3719 pmulhrsw m3, m7
3720 packuswb m1, m3
3721
3722 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3723
; second half: destination moves down by 8 * stride
3724 lea r0, [r6 + r1 * 4]
3725 lea r6, [r6 + r1 * 8]
3726
3727 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3728 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
3729 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
3730 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
3731
3732 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
3733 pmulhrsw m4, m7
3734 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
3735 pmulhrsw m5, m7
3736 packuswb m4, m5
3737
3738 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
3739 pmulhrsw m5, m7
3740 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
3741 pmulhrsw m6, m7
3742 packuswb m5, m6
3743
3744 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
3745 pmulhrsw m6, m7
3746 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
3747 pmulhrsw m0, m7
3748 packuswb m6, m0
3749
; the high half of m2 now holds main-reference bytes 1..8, so this step
; brings bytes 7 and 8 in at the low end of the window
3750 palignr m3, m2, 14
3751
3752 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3753 pmulhrsw m1, m7
3754 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3755 pmulhrsw m0, m7
3756 packuswb m1, m0
3757
3758 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3759
3760 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3761 pmulhrsw m4, m7
3762 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3763 pmulhrsw m5, m7
3764 packuswb m4, m5
3765
3766 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3767 pmulhrsw m5, m7
3768 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3769 pmulhrsw m6, m7
3770 packuswb m5, m6
3771
; shift m2 up one byte so the next window step picks up the byte pair that
; sits one position lower
3772 pslldq m2, 1
3773 palignr m3, m2, 14
3774
3775 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3776 pmulhrsw m6, m7
3777 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3778 pmulhrsw m1, m7
3779 packuswb m6, m1
3780
3781 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3782 pmulhrsw m1, m7
3783 pmaddubsw m3, [r4] ; [16]
3784 pmulhrsw m3, m7
3785 packuswb m1, m3
3786
3787 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3788
3789 RET
3790
3791INIT_XMM sse4
; intra_pred_ang16_24: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 keeps the incoming r0 for the second pass.
;   r3    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r2    : second reference, gathered by the c_mode16_12 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Two passes of 16 pmaddubsw rows each; every group of four packed registers is
; handed to TRANSPOSE_STORE_8x8 (defined outside this view -- presumably it
; stores through r0 using r1/r5; confirm there).
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
3792cglobal intra_pred_ang16_24, 4,7,8
3793
3794 lea r4, [ang_table + 16 * 16]
3795 lea r5, [r1 * 3] ; r5 -> 3 * stride
3796 mov r6, r0
3797 mova m7, [pw_1024]
3798
3799 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3800 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3801 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
3802 movu m2, [r2]
3803 pshufb m2, [c_mode16_12]
3804
3805 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3806
3807 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
3808 pmulhrsw m4, m7
3809 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
3810 pmulhrsw m1, m7
3811 packuswb m4, m1
3812
3813 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
3814 pmulhrsw m5, m7
3815 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
3816 pmulhrsw m6, m7
3817 packuswb m5, m6
3818
3819 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
3820 pmulhrsw m6, m7
3821 pmaddubsw m0, [r4 - 14 * 16] ; [2]
3822 pmulhrsw m0, m7
3823 packuswb m6, m0
3824
3825 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
3826
3827 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3828 pmulhrsw m1, m7
3829 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3830 pmulhrsw m0, m7
3831 packuswb m1, m0
3832
3833 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3834
3835 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3836 pmulhrsw m4, m7
3837 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3838 pmulhrsw m5, m7
3839 packuswb m4, m5
3840
3841 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3842 pmulhrsw m5, m7
3843 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3844 pmulhrsw m6, m7
3845 packuswb m5, m6
3846
3847 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
3848
3849 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3850 pmulhrsw m6, m7
3851 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3852 pmulhrsw m1, m7
3853 packuswb m6, m1
3854
3855 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3856 pmulhrsw m1, m7
3857 pmaddubsw m3, [r4] ; [16]
3858 pmulhrsw m3, m7
3859 packuswb m1, m3
3860
3861 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3862
; Second pass: restart 8 bytes past the pointer saved in r6 and rebuild the
; pair window from reference bytes 8..16 of r3.
3863 lea r0, [r6 + 8]
3864
3865 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3866 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
3867 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
3868 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
3869
3870 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
3871 pmulhrsw m4, m7
3872 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
3873 pmulhrsw m5, m7
3874 packuswb m4, m5
3875
3876 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
3877 pmulhrsw m5, m7
3878 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
3879 pmulhrsw m6, m7
3880 packuswb m5, m6
3881
3882 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
3883 pmulhrsw m6, m7
3884 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
3885 pmulhrsw m0, m7
3886 packuswb m6, m0
3887
3888 palignr m3, m2, 14
3889
3890 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3891 pmulhrsw m1, m7
3892 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3893 pmulhrsw m0, m7
3894 packuswb m1, m0
3895
3896 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3897
3898 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3899 pmulhrsw m4, m7
3900 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3901 pmulhrsw m5, m7
3902 packuswb m4, m5
3903
3904 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3905 pmulhrsw m5, m7
3906 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3907 pmulhrsw m6, m7
3908 packuswb m5, m6
3909
3910 pslldq m2, 1
3911 palignr m3, m2, 14
3912
3913 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3914 pmulhrsw m6, m7
3915 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3916 pmulhrsw m1, m7
3917 packuswb m6, m1
3918
3919 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3920 pmulhrsw m1, m7
3921 pmaddubsw m3, [r4] ; [16]
3922 pmulhrsw m3, m7
3923 packuswb m1, m3
3924
3925 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3926
3927 RET
3928
3929INIT_XMM sse4
; intra_pred_ang16_13: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 = r0 + 4 * stride.
;   r2    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r3    : second reference, gathered by the c_mode16_13 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Two passes of 16 pmaddubsw rows each; every group of four packed registers is
; handed to TRANSPOSE_STORE_8x8 (defined outside this view -- presumably it
; transposes and stores through r0/r6; confirm there).
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
3930cglobal intra_pred_ang16_13, 4,7,8
3931
3932 lea r4, [ang_table + 16 * 16]
3933 lea r5, [r1 * 3] ; r5 -> 3 * stride
3934 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3935 mova m7, [pw_1024]
3936
3937 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3938 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3939 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
3940 movu m2, [r3]
3941 pshufb m2, [c_mode16_13]
3942
3943 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3944
3945 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
3946 pmulhrsw m4, m7
3947 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
3948 pmulhrsw m0, m7
3949 packuswb m4, m0
3950
3951 pmaddubsw m5, [r4 - 11 * 16] ; [05]
3952 pmulhrsw m5, m7
3953
3954 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
3955
3956 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
3957 pmulhrsw m6, m7
3958 packuswb m5, m6
3959
3960 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
3961 pmulhrsw m6, m7
3962 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
3963 pmulhrsw m0, m7
3964 packuswb m6, m0
3965
3966 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
3967 pmulhrsw m1, m7
3968
3969 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
3970
3971 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3972 pmulhrsw m0, m7
3973 packuswb m1, m0
3974
3975 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3976
3977 pmaddubsw m4, m3, [r4 - 16] ; [15]
3978 pmulhrsw m4, m7
3979 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
3980 pmulhrsw m5, m7
3981 packuswb m4, m5
3982
3983 pslldq m2, 1 ; move the next shuffled byte up into m2[14..15] for the palignr
3984 palignr m3, m2, 14
3985
3986 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
3987 pmulhrsw m5, m7
3988 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
3989 pmulhrsw m6, m7
3990 packuswb m5, m6
3991
3992 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
3993 pmulhrsw m6, m7
3994 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
3995 pmulhrsw m1, m7
3996 packuswb m6, m1
3997
3998 pslldq m2, 1
3999 palignr m3, m2, 14
4000
4001 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4002 pmulhrsw m1, m7
4003 pmaddubsw m3, [r4] ; [16]
4004 pmulhrsw m3, m7
4005 packuswb m1, m3
4006
4007 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4008
; Second pass: r0 = r6 + 4 * stride and r6 += 8 * stride, then rebuild the pair
; window from reference bytes 8..16 of r2.
4009 lea r0, [r6 + r1 * 4]
4010 lea r6, [r6 + r1 * 8]
4011
4012 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4013 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4014 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4015 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4016
4017 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
4018 pmulhrsw m4, m7
4019 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
4020 pmulhrsw m5, m7
4021 packuswb m4, m5
4022
4023 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4024 pmulhrsw m5, m7
4025
4026 palignr m3, m2, 14
4027
4028 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4029 pmulhrsw m6, m7
4030 packuswb m5, m6
4031
4032 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4033 pmulhrsw m6, m7
4034 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4035 pmulhrsw m0, m7
4036 packuswb m6, m0
4037
4038 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4039 pmulhrsw m1, m7
4040
4041 pslldq m2, 1
4042 palignr m3, m2, 14
4043
4044 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4045 pmulhrsw m0, m7
4046 packuswb m1, m0
4047
4048 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4049
4050 pmaddubsw m4, m3, [r4 - 16] ; [15]
4051 pmulhrsw m4, m7
4052 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4053 pmulhrsw m5, m7
4054 packuswb m4, m5
4055
4056 pslldq m2, 1
4057 palignr m3, m2, 14
4058
4059 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4060 pmulhrsw m5, m7
4061 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4062 pmulhrsw m6, m7
4063 packuswb m5, m6
4064
4065 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4066 pmulhrsw m6, m7
4067 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4068 pmulhrsw m1, m7
4069 packuswb m6, m1
4070
4071 pslldq m2, 1
4072 palignr m3, m2, 14
4073
4074 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4075 pmulhrsw m1, m7
4076 pmaddubsw m3, [r4] ; [16]
4077 pmulhrsw m3, m7
4078 packuswb m1, m3
4079
4080 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4081
4082 RET
4083
4084INIT_XMM sse4
; intra_pred_ang16_23: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 keeps the incoming r0 for the second pass.
;   r3    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r2    : second reference, gathered by the c_mode16_13 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Same weight sequence as intra_pred_ang16_13 with r2/r3 swapped and the second
; TRANSPOSE_STORE_8x8 argument set to 0 instead of 1 (macro defined outside this
; view -- presumably 0 selects a store without transposition; confirm there).
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
4085cglobal intra_pred_ang16_23, 4,7,8
4086
4087 lea r4, [ang_table + 16 * 16]
4088 lea r5, [r1 * 3] ; r5 -> 3 * stride
4089 mov r6, r0
4090 mova m7, [pw_1024]
4091
4092 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4093 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4094 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4095 movu m2, [r2]
4096 pshufb m2, [c_mode16_13]
4097
4098 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4099
4100 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
4101 pmulhrsw m4, m7
4102 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
4103 pmulhrsw m0, m7
4104 packuswb m4, m0
4105
4106 pmaddubsw m5, [r4 - 11 * 16] ; [05]
4107 pmulhrsw m5, m7
4108
4109 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
4110
4111 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4112 pmulhrsw m6, m7
4113 packuswb m5, m6
4114
4115 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4116 pmulhrsw m6, m7
4117 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4118 pmulhrsw m0, m7
4119 packuswb m6, m0
4120
4121 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4122 pmulhrsw m1, m7
4123
4124 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
4125
4126 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4127 pmulhrsw m0, m7
4128 packuswb m1, m0
4129
4130 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4131
4132 pmaddubsw m4, m3, [r4 - 16] ; [15]
4133 pmulhrsw m4, m7
4134 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4135 pmulhrsw m5, m7
4136 packuswb m4, m5
4137
4138 pslldq m2, 1 ; move the next shuffled byte up into m2[14..15] for the palignr
4139 palignr m3, m2, 14
4140
4141 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4142 pmulhrsw m5, m7
4143 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4144 pmulhrsw m6, m7
4145 packuswb m5, m6
4146
4147 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4148 pmulhrsw m6, m7
4149 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4150 pmulhrsw m1, m7
4151 packuswb m6, m1
4152
4153 pslldq m2, 1
4154 palignr m3, m2, 14
4155
4156 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4157 pmulhrsw m1, m7
4158 pmaddubsw m3, [r4] ; [16]
4159 pmulhrsw m3, m7
4160 packuswb m1, m3
4161
4162 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4163
; Second pass: restart 8 bytes past the pointer saved in r6 and rebuild the
; pair window from reference bytes 8..16 of r3.
4164 lea r0, [r6 + 8]
4165
4166 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4167 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4168 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4169 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4170
4171 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
4172 pmulhrsw m4, m7
4173 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
4174 pmulhrsw m5, m7
4175 packuswb m4, m5
4176
4177 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4178 pmulhrsw m5, m7
4179
4180 palignr m3, m2, 14
4181
4182 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4183 pmulhrsw m6, m7
4184 packuswb m5, m6
4185
4186 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4187 pmulhrsw m6, m7
4188 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4189 pmulhrsw m0, m7
4190 packuswb m6, m0
4191
4192 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4193 pmulhrsw m1, m7
4194
4195 pslldq m2, 1
4196 palignr m3, m2, 14
4197
4198 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4199 pmulhrsw m0, m7
4200 packuswb m1, m0
4201
4202 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4203
4204 pmaddubsw m4, m3, [r4 - 16] ; [15]
4205 pmulhrsw m4, m7
4206 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4207 pmulhrsw m5, m7
4208 packuswb m4, m5
4209
4210 pslldq m2, 1
4211 palignr m3, m2, 14
4212
4213 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4214 pmulhrsw m5, m7
4215 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4216 pmulhrsw m6, m7
4217 packuswb m5, m6
4218
4219 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4220 pmulhrsw m6, m7
4221 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4222 pmulhrsw m1, m7
4223 packuswb m6, m1
4224
4225 pslldq m2, 1
4226 palignr m3, m2, 14
4227
4228 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4229 pmulhrsw m1, m7
4230 pmaddubsw m3, [r4] ; [16]
4231 pmulhrsw m3, m7
4232 packuswb m1, m3
4233
4234 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4235
4236 RET
4237
4238INIT_XMM sse4
; intra_pred_ang16_14: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 = r0 + 4 * stride.
;   r2    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r3    : second reference, gathered by the c_mode16_14 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Two passes of 16 pmaddubsw rows each; every group of four packed registers is
; handed to TRANSPOSE_STORE_8x8 (defined outside this view -- presumably it
; transposes and stores through r0/r6; confirm there).
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
4239cglobal intra_pred_ang16_14, 4,7,8
4240
4241 lea r4, [ang_table + 16 * 16]
4242 lea r5, [r1 * 3] ; r5 -> 3 * stride
4243 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
4244 mova m7, [pw_1024]
4245
4246 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4247 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4248 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4249 movu m2, [r3]
4250 pshufb m2, [c_mode16_14]
4251
4252 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4253
4254 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
4255 pmulhrsw m4, m7
4256 pmaddubsw m5, [r4 - 10 * 16] ; [06]
4257 pmulhrsw m5, m7
4258 packuswb m4, m5
4259
4260 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
4261
4262 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4263 pmulhrsw m5, m7
4264 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4265 pmulhrsw m6, m7
4266 packuswb m5, m6
4267
4268 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
4269
4270 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4271 pmulhrsw m6, m7
4272 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4273 pmulhrsw m0, m7
4274 packuswb m6, m0
4275
4276 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4277 pmulhrsw m1, m7
4278
4279 pslldq m2, 1 ; move the next shuffled byte up into m2[14..15] for the palignr
4280 palignr m3, m2, 14
4281
4282 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4283 pmulhrsw m0, m7
4284 packuswb m1, m0
4285
4286 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4287
4288 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4289 pmulhrsw m4, m7
4290
4291 pslldq m2, 1
4292 palignr m3, m2, 14
4293
4294 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4295 pmulhrsw m5, m7
4296 packuswb m4, m5
4297
4298 pmaddubsw m5, m3, [r4 + 16] ; [17]
4299 pmulhrsw m5, m7
4300 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4301 pmulhrsw m6, m7
4302 packuswb m5, m6
4303
4304 pslldq m2, 1
4305 palignr m3, m2, 14
4306
4307 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4308 pmulhrsw m6, m7
4309 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4310 pmulhrsw m1, m7
4311 packuswb m6, m1
4312
4313 pslldq m2, 1
4314 palignr m3, m2, 14
4315
4316 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4317 pmulhrsw m1, m7
4318 pmaddubsw m3, [r4] ; [16]
4319 pmulhrsw m3, m7
4320 packuswb m1, m3
4321
4322 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4323
; Second pass: r0 = r6 + 4 * stride and r6 += 8 * stride, then rebuild the pair
; window from reference bytes 8..16 of r2.
4324 lea r0, [r6 + r1 * 4]
4325 lea r6, [r6 + r1 * 8]
4326
4327 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4328 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4329 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4330 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4331
4332 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
4333 pmulhrsw m4, m7
4334 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4335 pmulhrsw m5, m7
4336 packuswb m4, m5
4337
4338 palignr m3, m2, 14
4339
4340 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4341 pmulhrsw m5, m7
4342 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4343 pmulhrsw m6, m7
4344 packuswb m5, m6
4345
4346 pslldq m2, 1
4347 palignr m3, m2, 14
4348
4349 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4350 pmulhrsw m6, m7
4351 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4352 pmulhrsw m0, m7
4353 packuswb m6, m0
4354
4355 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4356 pmulhrsw m1, m7
4357
4358 pslldq m2, 1
4359 palignr m3, m2, 14
4360
4361 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4362 pmulhrsw m0, m7
4363 packuswb m1, m0
4364
4365 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4366
4367 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4368 pmulhrsw m4, m7
4369
4370 pslldq m2, 1
4371 palignr m3, m2, 14
4372
4373 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4374 pmulhrsw m5, m7
4375 packuswb m4, m5
4376
4377 pmaddubsw m5, m3, [r4 + 16] ; [17]
4378 pmulhrsw m5, m7
4379 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4380 pmulhrsw m6, m7
4381 packuswb m5, m6
4382
4383 pslldq m2, 1
4384 palignr m3, m2, 14
4385
4386 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4387 pmulhrsw m6, m7
4388 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4389 pmulhrsw m1, m7
4390 packuswb m6, m1
4391
4392 pslldq m2, 1
4393 palignr m3, m2, 14
4394
4395 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4396 pmulhrsw m1, m7
4397 pmaddubsw m3, [r4] ; [16]
4398 pmulhrsw m3, m7
4399 packuswb m1, m3
4400
4401 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4402
4403 RET
4404
4405INIT_XMM sse4
; intra_pred_ang16_22: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 keeps the incoming r0 for the second pass.
;   r3    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r2    : second reference, gathered by the c_mode16_14 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Same weight sequence as intra_pred_ang16_14 with r2/r3 swapped and the second
; TRANSPOSE_STORE_8x8 argument set to 0 instead of 1 (macro defined outside this
; view -- presumably 0 selects a store without transposition; confirm there).
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
4406cglobal intra_pred_ang16_22, 4,7,8
4407
4408 lea r4, [ang_table + 16 * 16]
4409 lea r5, [r1 * 3] ; r5 -> 3 * stride
4410 mov r6, r0
4411 mova m7, [pw_1024]
4412
4413 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4414 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4415 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4416 movu m2, [r2]
4417 pshufb m2, [c_mode16_14]
4418
4419 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4420
4421 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
4422 pmulhrsw m4, m7
4423 pmaddubsw m5, [r4 - 10 * 16] ; [06]
4424 pmulhrsw m5, m7
4425 packuswb m4, m5
4426
4427 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
4428
4429 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4430 pmulhrsw m5, m7
4431 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4432 pmulhrsw m6, m7
4433 packuswb m5, m6
4434
4435 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
4436
4437 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4438 pmulhrsw m6, m7
4439 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4440 pmulhrsw m0, m7
4441 packuswb m6, m0
4442
4443 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4444 pmulhrsw m1, m7
4445
4446 pslldq m2, 1 ; move the next shuffled byte up into m2[14..15] for the palignr
4447 palignr m3, m2, 14
4448
4449 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4450 pmulhrsw m0, m7
4451 packuswb m1, m0
4452
4453 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4454
4455 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4456 pmulhrsw m4, m7
4457
4458 pslldq m2, 1
4459 palignr m3, m2, 14
4460
4461 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4462 pmulhrsw m5, m7
4463 packuswb m4, m5
4464
4465 pmaddubsw m5, m3, [r4 + 16] ; [17]
4466 pmulhrsw m5, m7
4467 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4468 pmulhrsw m6, m7
4469 packuswb m5, m6
4470
4471 pslldq m2, 1
4472 palignr m3, m2, 14
4473
4474 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4475 pmulhrsw m6, m7
4476 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4477 pmulhrsw m1, m7
4478 packuswb m6, m1
4479
4480 pslldq m2, 1
4481 palignr m3, m2, 14
4482
4483 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4484 pmulhrsw m1, m7
4485 pmaddubsw m3, [r4] ; [16]
4486 pmulhrsw m3, m7
4487 packuswb m1, m3
4488
4489 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4490
; Second pass: restart 8 bytes past the pointer saved in r6 and rebuild the
; pair window from reference bytes 8..16 of r3.
4491 lea r0, [r6 + 8]
4492
4493 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4494 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4495 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4496 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4497
4498 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
4499 pmulhrsw m4, m7
4500 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4501 pmulhrsw m5, m7
4502 packuswb m4, m5
4503
4504 palignr m3, m2, 14
4505
4506 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4507 pmulhrsw m5, m7
4508 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4509 pmulhrsw m6, m7
4510 packuswb m5, m6
4511
4512 pslldq m2, 1
4513 palignr m3, m2, 14
4514
4515 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4516 pmulhrsw m6, m7
4517 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4518 pmulhrsw m0, m7
4519 packuswb m6, m0
4520
4521 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4522 pmulhrsw m1, m7
4523
4524 pslldq m2, 1
4525 palignr m3, m2, 14
4526
4527 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4528 pmulhrsw m0, m7
4529 packuswb m1, m0
4530
4531 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4532
4533 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4534 pmulhrsw m4, m7
4535
4536 pslldq m2, 1
4537 palignr m3, m2, 14
4538
4539 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4540 pmulhrsw m5, m7
4541 packuswb m4, m5
4542
4543 pmaddubsw m5, m3, [r4 + 16] ; [17]
4544 pmulhrsw m5, m7
4545 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4546 pmulhrsw m6, m7
4547 packuswb m5, m6
4548
4549 pslldq m2, 1
4550 palignr m3, m2, 14
4551
4552 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4553 pmulhrsw m6, m7
4554 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4555 pmulhrsw m1, m7
4556 packuswb m6, m1
4557
4558 pslldq m2, 1
4559 palignr m3, m2, 14
4560
4561 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4562 pmulhrsw m1, m7
4563 pmaddubsw m3, [r4] ; [16]
4564 pmulhrsw m3, m7
4565 packuswb m1, m3
4566
4567 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4568
4569 RET
4570
4571INIT_XMM sse4
; intra_pred_ang16_15: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 = r0 + 4 * stride.
;   r2    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r3    : second reference, gathered by the c_mode16_15 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Two passes of 16 pmaddubsw rows each; every group of four packed registers is
; handed to TRANSPOSE_STORE_8x8 (defined outside this view -- presumably it
; transposes and stores through r0/r6; confirm there).
; The second pass shifts m2 far enough that bytes left in its low qword by the
; first pass are consumed, so the first-pass pslldq count matters here.
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
4572cglobal intra_pred_ang16_15, 4,7,8
4573
4574 lea r4, [ang_table + 16 * 16]
4575 lea r5, [r1 * 3] ; r5 -> 3 * stride
4576 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
4577 mova m7, [pw_1024]
4578
4579 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4580 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4581 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4582 movu m2, [r3]
4583 pshufb m2, [c_mode16_15]
4584
4585 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4586
4587 pmaddubsw m4, [r4 - 16] ; [15]
4588 pmulhrsw m4, m7
4589
4590 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
4591
4592 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4593 pmulhrsw m5, m7
4594 packuswb m4, m5
4595
4596 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4597 pmulhrsw m5, m7
4598
4599 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
4600
4601 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4602 pmulhrsw m6, m7
4603 packuswb m5, m6
4604
4605 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4606 pmulhrsw m6, m7
4607
4608 pslldq m2, 1 ; move the next shuffled byte up into m2[14..15] for the palignr
4609 palignr m3, m2, 14
4610
4611 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4612 pmulhrsw m0, m7
4613 packuswb m6, m0
4614
4615 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4616 pmulhrsw m1, m7
4617
4618 pslldq m2, 1
4619 palignr m3, m2, 14
4620
4621 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4622 pmulhrsw m0, m7
4623 packuswb m1, m0
4624
4625 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4626
4627 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4628 pmulhrsw m4, m7
4629
4630 pslldq m2, 1
4631 palignr m3, m2, 14
4632
4633 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4634 pmulhrsw m5, m7
4635 packuswb m4, m5
4636
4637 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4638 pmulhrsw m5, m7
4639
4640 pslldq m2, 1
4641 palignr m3, m2, 14
4642
4643 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4644 pmulhrsw m6, m7
4645 packuswb m5, m6
4646
4647 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4648 pmulhrsw m6, m7
4649
4650 pslldq m2, 1
4651 palignr m3, m2, 14
4652
4653 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4654 pmulhrsw m1, m7
4655 packuswb m6, m1
4656
4657 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4658 pmulhrsw m1, m7
4659
4660 pslldq m2, 1
4661 palignr m3, m2, 14
4662
4663 pmaddubsw m3, [r4] ; [16]
4664 pmulhrsw m3, m7
4665 packuswb m1, m3
4666
4667 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4668
; Second pass: r0 = r6 + 4 * stride and r6 += 8 * stride, then rebuild the pair
; window from reference bytes 8..16 of r2. movlhps only replaces the high qword
; of m2; the low qword still holds what the first pass left there.
4669 lea r0, [r6 + r1 * 4]
4670 lea r6, [r6 + r1 * 8]
4671
4672 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4673 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4674 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4675 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
4676
4677 pmaddubsw m4, m3, [r4 - 16] ; [15]
4678 pmulhrsw m4, m7
4679
4680 palignr m3, m2, 14
4681
4682 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4683 pmulhrsw m5, m7
4684 packuswb m4, m5
4685
4686 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4687 pmulhrsw m5, m7
4688
4689 pslldq m2, 1
4690 palignr m3, m2, 14
4691
4692 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4693 pmulhrsw m6, m7
4694 packuswb m5, m6
4695
4696 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4697 pmulhrsw m6, m7
4698
4699 pslldq m2, 1
4700 palignr m3, m2, 14
4701
4702 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4703 pmulhrsw m0, m7
4704 packuswb m6, m0
4705
4706 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4707 pmulhrsw m1, m7
4708
4709 pslldq m2, 1
4710 palignr m3, m2, 14
4711
4712 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4713 pmulhrsw m0, m7
4714 packuswb m1, m0
4715
4716 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4717
4718 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4719 pmulhrsw m4, m7
4720
4721 pslldq m2, 1
4722 palignr m3, m2, 14
4723
4724 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4725 pmulhrsw m5, m7
4726 packuswb m4, m5
4727
4728 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4729 pmulhrsw m5, m7
4730
4731 pslldq m2, 1
4732 palignr m3, m2, 14
4733
4734 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4735 pmulhrsw m6, m7
4736 packuswb m5, m6
4737
4738 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4739 pmulhrsw m6, m7
4740
4741 pslldq m2, 1
4742 palignr m3, m2, 14
4743
4744 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4745 pmulhrsw m1, m7
4746 packuswb m6, m1
4747
4748 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4749 pmulhrsw m1, m7
4750
4751 pslldq m2, 1
4752 palignr m3, m2, 14
4753
4754 pmaddubsw m3, [r4] ; [16]
4755 pmulhrsw m3, m7
4756 packuswb m1, m3
4757
4758 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4759
4760 RET
4761
4762INIT_XMM sse4
; intra_pred_ang16_21: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 keeps the incoming r0 for the second pass.
;   r3    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r2    : second reference, gathered by the c_mode16_15 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Same weight sequence as intra_pred_ang16_15 with r2/r3 swapped and the second
; TRANSPOSE_STORE_8x8 argument set to 0 instead of 1 (macro defined outside this
; view -- presumably 0 selects a store without transposition; confirm there).
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
4763cglobal intra_pred_ang16_21, 4,7,8
4764
4765 lea r4, [ang_table + 16 * 16]
4766 lea r5, [r1 * 3] ; r5 -> 3 * stride
4767 mov r6, r0
4768 mova m7, [pw_1024]
4769
4770 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4771 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4772 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4773 movu m2, [r2]
4774 pshufb m2, [c_mode16_15]
4775
4776 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4777
4778 pmaddubsw m4, [r4 - 16] ; [15]
4779 pmulhrsw m4, m7
4780
4781 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
4782
4783 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4784 pmulhrsw m5, m7
4785 packuswb m4, m5
4786
4787 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4788 pmulhrsw m5, m7
4789
4790 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
4791
4792 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4793 pmulhrsw m6, m7
4794 packuswb m5, m6
4795
4796 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4797 pmulhrsw m6, m7
4798
4799 pslldq m2, 1 ; move the next shuffled byte up into m2[14..15] for the palignr
4800 palignr m3, m2, 14
4801
4802 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4803 pmulhrsw m0, m7
4804 packuswb m6, m0
4805
4806 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4807 pmulhrsw m1, m7
4808
4809 pslldq m2, 1
4810 palignr m3, m2, 14
4811
4812 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4813 pmulhrsw m0, m7
4814 packuswb m1, m0
4815
4816 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4817
4818 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4819 pmulhrsw m4, m7
4820
4821 pslldq m2, 1
4822 palignr m3, m2, 14
4823
4824 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4825 pmulhrsw m5, m7
4826 packuswb m4, m5
4827
4828 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4829 pmulhrsw m5, m7
4830
4831 pslldq m2, 1
4832 palignr m3, m2, 14
4833
4834 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4835 pmulhrsw m6, m7
4836 packuswb m5, m6
4837
4838 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4839 pmulhrsw m6, m7
4840
4841 pslldq m2, 1
4842 palignr m3, m2, 14
4843
4844 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4845 pmulhrsw m1, m7
4846 packuswb m6, m1
4847
4848 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4849 pmulhrsw m1, m7
4850
4851 pslldq m2, 1
4852 palignr m3, m2, 14
4853
4854 pmaddubsw m3, [r4] ; [16]
4855 pmulhrsw m3, m7
4856 packuswb m1, m3
4857
4858 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4859
; Second pass: restart 8 bytes past the pointer saved in r6 and rebuild the
; pair window from reference bytes 8..16 of r3. movlhps only replaces the high
; qword of m2; the low qword still holds what the first pass left there.
4860 lea r0, [r6 + 8]
4861
4862 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4863 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4864 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4865 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
4866
4867 pmaddubsw m4, m3, [r4 - 16] ; [15]
4868 pmulhrsw m4, m7
4869
4870 palignr m3, m2, 14
4871
4872 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4873 pmulhrsw m5, m7
4874 packuswb m4, m5
4875
4876 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4877 pmulhrsw m5, m7
4878
4879 pslldq m2, 1
4880 palignr m3, m2, 14
4881
4882 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4883 pmulhrsw m6, m7
4884 packuswb m5, m6
4885
4886 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4887 pmulhrsw m6, m7
4888
4889 pslldq m2, 1
4890 palignr m3, m2, 14
4891
4892 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4893 pmulhrsw m0, m7
4894 packuswb m6, m0
4895
4896 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4897 pmulhrsw m1, m7
4898
4899 pslldq m2, 1
4900 palignr m3, m2, 14
4901
4902 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4903 pmulhrsw m0, m7
4904 packuswb m1, m0
4905
4906 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4907
4908 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4909 pmulhrsw m4, m7
4910
4911 pslldq m2, 1
4912 palignr m3, m2, 14
4913
4914 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4915 pmulhrsw m5, m7
4916 packuswb m4, m5
4917
4918 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4919 pmulhrsw m5, m7
4920
4921 pslldq m2, 1
4922 palignr m3, m2, 14
4923
4924 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4925 pmulhrsw m6, m7
4926 packuswb m5, m6
4927
4928 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4929 pmulhrsw m6, m7
4930
4931 pslldq m2, 1
4932 palignr m3, m2, 14
4933
4934 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4935 pmulhrsw m1, m7
4936 packuswb m6, m1
4937
4938 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4939 pmulhrsw m1, m7
4940
4941 pslldq m2, 1
4942 palignr m3, m2, 14
4943
4944 pmaddubsw m3, [r4] ; [16]
4945 pmulhrsw m3, m7
4946 packuswb m1, m3
4947
4948 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4949
4950 RET
4951
4952INIT_XMM sse4
; intra_pred_ang16_16: SSE4 kernel declared with 4 arguments, 7 GPRs, 8 XMM regs.
;   r1/r5 : stride and 3 * stride; r6 = r0 + 4 * stride.
;   r2    : reference read directly and interleaved into (p[i], p[i+1]) byte pairs.
;   r3    : second reference, gathered by the c_mode16_16 shuffle into m2; its
;           top bytes are prepended to the pair window by the palignr steps.
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] is the row annotated [16 + k].
;   m7    : pw_1024; pmulhrsw by 1024 equals a rounded >> 5 (assuming pw_1024
;           is eight words of 1024 -- it is defined outside this view).
; Two passes of 16 pmaddubsw rows each; every group of four packed registers is
; handed to TRANSPOSE_STORE_8x8 (defined outside this view -- presumably it
; transposes and stores through r0/r6; confirm there).
; Between the passes m2 is rotated (palignr m2, m2, 6) before its high qword is
; replaced, so that shuffled bytes survive in the low qword for the second pass.
; NOTE(review): r2/r3 are presumably the left/above neighbour arrays -- confirm.
4953cglobal intra_pred_ang16_16, 4,7,8
4954
4955 lea r4, [ang_table + 16 * 16]
4956 lea r5, [r1 * 3] ; r5 -> 3 * stride
4957 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
4958 mova m7, [pw_1024]
4959
4960 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4961 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4962 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4963 movu m2, [r3]
4964 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
4965 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4966
4967 pmaddubsw m4, [r4 - 5 * 16] ; [11]
4968 pmulhrsw m4, m7
4969
4970 palignr m3, m2, 15 ; m3 = m2[15] followed by old m3[0..14]: window slides one byte
4971
4972 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4973 pmulhrsw m5, m7
4974 packuswb m4, m5
4975
4976 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
4977 pmulhrsw m5, m7
4978
4979 palignr m3, m2, 14 ; m3 = m2[14..15] followed by old m3[0..13]: one more byte pair in front
4980
4981 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4982 pmulhrsw m6, m7
4983 packuswb m5, m6
4984
4985 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
4986 palignr m3, m2, 14
4987
4988 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4989 pmulhrsw m6, m7
4990 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
4991 pmulhrsw m0, m7
4992 packuswb m6, m0
4993
4994 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
4995 palignr m3, m2, 14
4996
4997 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
4998 pmulhrsw m1, m7
4999
5000 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
5001 palignr m3, m2, 14
5002
5003 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5004 pmulhrsw m0, m7
5005 packuswb m1, m0
5006
5007 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5008
5009 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5010 pmulhrsw m4, m7
5011
5012 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
5013 palignr m3, m2, 14
5014
5015 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5016 pmulhrsw m5, m7
5017 packuswb m4, m5
5018
5019 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
5020 palignr m3, m2, 14
5021
5022 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5023 pmulhrsw m5, m7
5024 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5025 pmulhrsw m6, m7
5026 packuswb m5, m6
5027
5028 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
5029 palignr m3, m2, 14
5030
5031 pmaddubsw m6, m3, [r4 - 16] ; [15]
5032 pmulhrsw m6, m7
5033
5034 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
5035 palignr m3, m2, 14
5036
5037 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5038 pmulhrsw m1, m7
5039 packuswb m6, m1
5040
5041 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5042 pmulhrsw m1, m7
5043
5044 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
5045 palignr m3, m2, 14
5046
5047 pmaddubsw m3, [r4] ; [16]
5048 pmulhrsw m3, m7
5049 packuswb m1, m3
5050
5051 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5052
; Second pass: r0 = r6 + 4 * stride and r6 += 8 * stride, then rebuild the pair
; window from reference bytes 8..16 of r2. m2 is rotated right by 6 bytes first
; so the remaining shuffled bytes land in the low qword, which movlhps keeps.
5053 lea r0, [r6 + r1 * 4]
5054 lea r6, [r6 + r1 * 8]
5055
5056 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5057 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5058 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5059 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5060 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
5061
5062 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
5063 pmulhrsw m4, m7
5064
5065 palignr m3, m2, 14
5066
5067 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5068 pmulhrsw m5, m7
5069 packuswb m4, m5
5070
5071 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5072 pmulhrsw m5, m7
5073
5074 pslldq m2, 1
5075 palignr m3, m2, 14
5076
5077 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5078 pmulhrsw m6, m7
5079 packuswb m5, m6
5080
5081 pslldq m2, 1
5082 palignr m3, m2, 14
5083
5084 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5085 pmulhrsw m6, m7
5086
5087 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5088 pmulhrsw m0, m7
5089 packuswb m6, m0
5090
5091 pslldq m2, 1
5092 palignr m3, m2, 14
5093
5094 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5095 pmulhrsw m1, m7
5096
5097 pslldq m2, 1
5098 palignr m3, m2, 14
5099
5100 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5101 pmulhrsw m0, m7
5102 packuswb m1, m0
5103
5104 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5105
5106 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5107 pmulhrsw m4, m7
5108
5109 pslldq m2, 1
5110 palignr m3, m2, 14
5111
5112 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5113 pmulhrsw m5, m7
5114 packuswb m4, m5
5115
5116 pslldq m2, 1
5117 palignr m3, m2, 14
5118
5119 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5120 pmulhrsw m5, m7
5121 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5122 pmulhrsw m6, m7
5123 packuswb m5, m6
5124
5125 pslldq m2, 1
5126 palignr m3, m2, 14
5127
5128 pmaddubsw m6, m3, [r4 - 16] ; [15]
5129 pmulhrsw m6, m7
5130
5131 pslldq m2, 1
5132 palignr m3, m2, 14
5133
5134 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5135 pmulhrsw m1, m7
5136 packuswb m6, m1
5137
5138 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5139 pmulhrsw m1, m7
5140
5141 pslldq m2, 1
5142 palignr m3, m2, 14
5143
5144 pmaddubsw m3, [r4] ; [16]
5145 pmulhrsw m3, m7
5146 packuswb m1, m3
5147
5148 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5149
5150 RET
5151
; intra_pred_ang16_20: 16x16 angular intra prediction, mode 20 (SSE4).
; Register use established by the code below:
;   r0 = destination, r1 = destination stride
;   r3 = main reference row (loaded directly), r2 = side reference, gathered
;        through the c_mode16_16 byte shuffle and fed in one byte at a time
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] selects weight row (16 + k);
;        the bracketed number after each pmaddubsw is that row index
;   r5 = 3 * stride, r6 = copy of the original destination pointer
;   m7 = pw_1024
; The block is produced as two 8-column halves; each half issues two
; TRANSPOSE_STORE_8x8 calls (macro defined elsewhere in this file).
; NOTE(review): the C prototype is not visible in this part of the file;
; presumably r2/r3 are refLeft/refAbove as for intraPredAng32 -- confirm.
5152INIT_XMM sse4
5153cglobal intra_pred_ang16_20, 4,7,8
5154
5155 lea r4, [ang_table + 16 * 16]
5156 lea r5, [r1 * 3] ; r5 -> 3 * stride
5157 mov r6, r0 ; keep dst so the second half can start at dst + 8
5158 mova m7, [pw_1024] ; pmulhrsw by 1024 rounds as (x + 16) >> 5
5159
5160 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5161 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5162 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5163 movu m2, [r2]
5164 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
5165 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5166
5167 pmaddubsw m4, [r4 - 5 * 16] ; [11]
5168 pmulhrsw m4, m7
5169
5170 palignr m3, m2, 15
5171
5172 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5173 pmulhrsw m5, m7
5174 packuswb m4, m5
5175
5176 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5177 pmulhrsw m5, m7
5178
5179 palignr m3, m2, 14
5180
5181 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5182 pmulhrsw m6, m7
5183 packuswb m5, m6
5184
5185 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
5186 palignr m3, m2, 14
5187
5188 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5189 pmulhrsw m6, m7
5190 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5191 pmulhrsw m0, m7
5192 packuswb m6, m0
5193
5194 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5195 palignr m3, m2, 14
5196
5197 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5198 pmulhrsw m1, m7
5199
5200 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
5201 palignr m3, m2, 14
5202
5203 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5204 pmulhrsw m0, m7
5205 packuswb m1, m0
5206
5207 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5208
5209 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5210 pmulhrsw m4, m7
5211
5212 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
5213 palignr m3, m2, 14
5214
5215 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5216 pmulhrsw m5, m7
5217 packuswb m4, m5
5218
5219 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
5220 palignr m3, m2, 14
5221
5222 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5223 pmulhrsw m5, m7
5224 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5225 pmulhrsw m6, m7
5226 packuswb m5, m6
5227
5228 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
5229 palignr m3, m2, 14
5230
5231 pmaddubsw m6, m3, [r4 - 16] ; [15]
5232 pmulhrsw m6, m7
5233
5234 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
5235 palignr m3, m2, 14
5236
5237 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5238 pmulhrsw m1, m7
5239 packuswb m6, m1
5240
5241 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5242 pmulhrsw m1, m7
5243
5244 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
5245 palignr m3, m2, 14
5246
5247 pmaddubsw m3, [r4] ; [16]
5248 pmulhrsw m3, m7
5249 packuswb m1, m3
5250
5251 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5252
5253 lea r0, [r6 + 8] ; second half: columns 8..15 of the original dst
5254
5255 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5256 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5257 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5258 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5259 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
5260
5261 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
5262 pmulhrsw m4, m7
5263
5264 palignr m3, m2, 14
5265
5266 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5267 pmulhrsw m5, m7
5268 packuswb m4, m5
5269
5270 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5271 pmulhrsw m5, m7
5272
5273 pslldq m2, 1
5274 palignr m3, m2, 14
5275
5276 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5277 pmulhrsw m6, m7
5278 packuswb m5, m6
5279
5280 pslldq m2, 1
5281 palignr m3, m2, 14
5282
5283 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5284 pmulhrsw m6, m7
5285
5286 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5287 pmulhrsw m0, m7
5288 packuswb m6, m0
5289
5290 pslldq m2, 1
5291 palignr m3, m2, 14
5292
5293 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5294 pmulhrsw m1, m7
5295
5296 pslldq m2, 1
5297 palignr m3, m2, 14
5298
5299 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5300 pmulhrsw m0, m7
5301 packuswb m1, m0
5302
5303 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5304
5305 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5306 pmulhrsw m4, m7
5307
5308 pslldq m2, 1
5309 palignr m3, m2, 14
5310
5311 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5312 pmulhrsw m5, m7
5313 packuswb m4, m5
5314
5315 pslldq m2, 1
5316 palignr m3, m2, 14
5317
5318 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5319 pmulhrsw m5, m7
5320 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5321 pmulhrsw m6, m7
5322 packuswb m5, m6
5323
5324 pslldq m2, 1
5325 palignr m3, m2, 14
5326
5327 pmaddubsw m6, m3, [r4 - 16] ; [15]
5328 pmulhrsw m6, m7
5329
5330 pslldq m2, 1
5331 palignr m3, m2, 14
5332
5333 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5334 pmulhrsw m1, m7
5335 packuswb m6, m1
5336
5337 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5338 pmulhrsw m1, m7
5339
5340 pslldq m2, 1
5341 palignr m3, m2, 14
5342
5343 pmaddubsw m3, [r4] ; [16]
5344 pmulhrsw m3, m7
5345 packuswb m1, m3
5346
5347 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5348
5349 RET
5350
; intra_pred_ang16_17: 16x16 angular intra prediction, mode 17 (SSE4).
; Register use established by the code below:
;   r0 = destination, r1 = destination stride
;   r2 = main reference (loaded directly), r3 = side reference, gathered
;        through the c_mode16_17 byte shuffle and fed in one byte at a time
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] selects weight row (16 + k);
;        the bracketed number after each pmaddubsw is that row index
;   r5 = 3 * stride, r6 = r0 + 4 * stride
;   m7 = pw_1024
; The block is produced as two groups of eight output rows (r0/r6 advance by
; 8 * stride between them); each group issues two TRANSPOSE_STORE_8x8 calls
; (macro defined elsewhere in this file) with its second argument set to 1.
; NOTE(review): the C prototype is not visible in this part of the file;
; presumably r2/r3 are refLeft/refAbove as for intraPredAng32 -- confirm.
5351INIT_XMM sse4
5352cglobal intra_pred_ang16_17, 4,7,8
5353
5354 lea r4, [ang_table + 16 * 16]
5355 lea r5, [r1 * 3] ; r5 -> 3 * stride
5356 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5357 mova m7, [pw_1024] ; pmulhrsw by 1024 rounds as (x + 16) >> 5
5358
5359 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5360 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5361 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5362 movu m2, [r3]
5363 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
5364 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5365
5366 pmaddubsw m4, [r4 - 10 * 16] ; [06]
5367 pmulhrsw m4, m7
5368
5369 palignr m3, m2, 15
5370
5371 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5372 pmulhrsw m5, m7
5373 packuswb m4, m5
5374
5375 palignr m3, m2, 14
5376
5377 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5378 pmulhrsw m5, m7
5379
5380 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
5381 pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
5382 palignr m3, m2, 14
5383
5384 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5385 pmulhrsw m6, m7
5386 packuswb m5, m6
5387
5388 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
5389 palignr m3, m2, 14
5390
5391 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5392 pmulhrsw m6, m7
5393 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5394 pmulhrsw m0, m7
5395 packuswb m6, m0
5396
5397 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
5398 palignr m3, m2, 14
5399
5400 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5401 pmulhrsw m1, m7
5402
5403 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5404 palignr m3, m2, 14
5405
5406 pmaddubsw m0, m3, [r4] ; [16]
5407 pmulhrsw m0, m7
5408 packuswb m1, m0
5409
5410 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5411
5412 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
5413 palignr m3, m2, 14
5414
5415 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5416 pmulhrsw m4, m7
5417
5418 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
5419 palignr m3, m2, 14
5420
5421 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5422 pmulhrsw m5, m7
5423 packuswb m4, m5
5424
5425 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5426 pmulhrsw m5, m7
5427
5428 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
5429 palignr m3, m2, 14
5430
5431 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5432 pmulhrsw m6, m7
5433 packuswb m5, m6
5434
5435 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
5436 palignr m3, m2, 14
5437
5438 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5439 pmulhrsw m6, m7
5440
5441 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
5442 palignr m3, m2, 14
5443
5444 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5445 pmulhrsw m1, m7
5446 packuswb m6, m1
5447
5448 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
5449 palignr m3, m2, 14
5450
5451 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5452 pmulhrsw m1, m7
5453 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5454 pmulhrsw m3, m7
5455 packuswb m1, m3
5456
5457 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5458
5459 lea r0, [r6 + r1 * 4] ; second group: r0 advances by 8 * stride
5460 lea r6, [r6 + r1 * 8]
5461
5462 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5463 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5464 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5465 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5466 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
5467
5468 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
5469 pmulhrsw m4, m7
5470
5471 palignr m3, m2, 14
5472
5473 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5474 pmulhrsw m5, m7
5475 packuswb m4, m5
5476
5477 pslldq m2, 1
5478 palignr m3, m2, 14
5479
5480 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5481 pmulhrsw m5, m7
5482
5483 pslldq m2, 1
5484 palignr m3, m2, 14
5485
5486 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5487 pmulhrsw m6, m7
5488 packuswb m5, m6
5489
5490 pslldq m2, 1
5491 palignr m3, m2, 14
5492
5493 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5494 pmulhrsw m6, m7
5495 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5496 pmulhrsw m0, m7
5497 packuswb m6, m0
5498
5499 pslldq m2, 1
5500 palignr m3, m2, 14
5501
5502 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5503 pmulhrsw m1, m7
5504
5505 pslldq m2, 1
5506 palignr m3, m2, 14
5507
5508 pmaddubsw m0, m3, [r4] ; [16]
5509 pmulhrsw m0, m7
5510 packuswb m1, m0
5511
5512 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5513
5514 pslldq m2, 1
5515 palignr m3, m2, 14
5516
5517 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5518 pmulhrsw m4, m7
5519
5520 pslldq m2, 1
5521 palignr m3, m2, 14
5522
5523 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5524 pmulhrsw m5, m7
5525 packuswb m4, m5
5526
5527 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5528 pmulhrsw m5, m7
5529
5530 pslldq m2, 1
5531 palignr m3, m2, 14
5532
5533 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5534 pmulhrsw m6, m7
5535 packuswb m5, m6
5536
5537 pslldq m2, 1
5538 palignr m3, m2, 14
5539
5540 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5541 pmulhrsw m6, m7
5542
5543 pslldq m2, 1
5544 palignr m3, m2, 14
5545
5546 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5547 pmulhrsw m1, m7
5548 packuswb m6, m1
5549
5550 pslldq m2, 1
5551 palignr m3, m2, 14
5552
5553 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5554 pmulhrsw m1, m7
5555 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5556 pmulhrsw m3, m7
5557 packuswb m1, m3
5558
5559 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5560
5561 RET
5562
; intra_pred_ang16_19: 16x16 angular intra prediction, mode 19 (SSE4).
; Same arithmetic and weight sequence as intra_pred_ang16_17, with the roles
; of r2 and r3 exchanged and TRANSPOSE_STORE_8x8 invoked with its second
; argument set to 0:
;   r0 = destination, r1 = destination stride
;   r3 = main reference (loaded directly), r2 = side reference, gathered
;        through the c_mode16_17 byte shuffle and fed in one byte at a time
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] selects weight row (16 + k);
;        the bracketed number after each pmaddubsw is that row index
;   r5 = 3 * stride, r6 = copy of the original destination pointer
;   m7 = pw_1024
; The block is produced as two 8-column halves (the second starts at dst + 8).
; NOTE(review): the C prototype is not visible in this part of the file;
; presumably r2/r3 are refLeft/refAbove as for intraPredAng32 -- confirm.
5563INIT_XMM sse4
5564cglobal intra_pred_ang16_19, 4,7,8
5565
5566 lea r4, [ang_table + 16 * 16]
5567 lea r5, [r1 * 3] ; r5 -> 3 * stride
5568 mov r6, r0 ; keep dst so the second half can start at dst + 8
5569 mova m7, [pw_1024] ; pmulhrsw by 1024 rounds as (x + 16) >> 5
5570
5571 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5572 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5573 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5574 movu m2, [r2]
5575 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
5576 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5577
5578 pmaddubsw m4, [r4 - 10 * 16] ; [06]
5579 pmulhrsw m4, m7
5580
5581 palignr m3, m2, 15
5582
5583 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5584 pmulhrsw m5, m7
5585 packuswb m4, m5
5586
5587 palignr m3, m2, 14
5588
5589 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5590 pmulhrsw m5, m7
5591
5592 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
5593 pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
5594 palignr m3, m2, 14
5595
5596 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5597 pmulhrsw m6, m7
5598 packuswb m5, m6
5599
5600 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
5601 palignr m3, m2, 14
5602
5603 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5604 pmulhrsw m6, m7
5605 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5606 pmulhrsw m0, m7
5607 packuswb m6, m0
5608
5609 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
5610 palignr m3, m2, 14
5611
5612 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5613 pmulhrsw m1, m7
5614
5615 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5616 palignr m3, m2, 14
5617
5618 pmaddubsw m0, m3, [r4] ; [16]
5619 pmulhrsw m0, m7
5620 packuswb m1, m0
5621
5622 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5623
5624 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
5625 palignr m3, m2, 14
5626
5627 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5628 pmulhrsw m4, m7
5629
5630 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
5631 palignr m3, m2, 14
5632
5633 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5634 pmulhrsw m5, m7
5635 packuswb m4, m5
5636
5637 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5638 pmulhrsw m5, m7
5639
5640 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
5641 palignr m3, m2, 14
5642
5643 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5644 pmulhrsw m6, m7
5645 packuswb m5, m6
5646
5647 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
5648 palignr m3, m2, 14
5649
5650 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5651 pmulhrsw m6, m7
5652
5653 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
5654 palignr m3, m2, 14
5655
5656 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5657 pmulhrsw m1, m7
5658 packuswb m6, m1
5659
5660 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
5661 palignr m3, m2, 14
5662
5663 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5664 pmulhrsw m1, m7
5665 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5666 pmulhrsw m3, m7
5667 packuswb m1, m3
5668
5669 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5670
5671 lea r0, [r6 + 8] ; second half: columns 8..15 of the original dst
5672
5673 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5674 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5675 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5676 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5677 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
5678
5679 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
5680 pmulhrsw m4, m7
5681
5682 palignr m3, m2, 14
5683
5684 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5685 pmulhrsw m5, m7
5686 packuswb m4, m5
5687
5688 pslldq m2, 1
5689 palignr m3, m2, 14
5690
5691 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5692 pmulhrsw m5, m7
5693
5694 pslldq m2, 1
5695 palignr m3, m2, 14
5696
5697 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5698 pmulhrsw m6, m7
5699 packuswb m5, m6
5700
5701 pslldq m2, 1
5702 palignr m3, m2, 14
5703
5704 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5705 pmulhrsw m6, m7
5706 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5707 pmulhrsw m0, m7
5708 packuswb m6, m0
5709
5710 pslldq m2, 1
5711 palignr m3, m2, 14
5712
5713 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5714 pmulhrsw m1, m7
5715
5716 pslldq m2, 1
5717 palignr m3, m2, 14
5718
5719 pmaddubsw m0, m3, [r4] ; [16]
5720 pmulhrsw m0, m7
5721 packuswb m1, m0
5722
5723 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5724
5725 pslldq m2, 1
5726 palignr m3, m2, 14
5727
5728 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5729 pmulhrsw m4, m7
5730
5731 pslldq m2, 1
5732 palignr m3, m2, 14
5733
5734 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5735 pmulhrsw m5, m7
5736 packuswb m4, m5
5737
5738 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5739 pmulhrsw m5, m7
5740
5741 pslldq m2, 1
5742 palignr m3, m2, 14
5743
5744 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5745 pmulhrsw m6, m7
5746 packuswb m5, m6
5747
5748 pslldq m2, 1
5749 palignr m3, m2, 14
5750
5751 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5752 pmulhrsw m6, m7
5753
5754 pslldq m2, 1
5755 palignr m3, m2, 14
5756
5757 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5758 pmulhrsw m1, m7
5759 packuswb m6, m1
5760
5761 pslldq m2, 1
5762 palignr m3, m2, 14
5763
5764 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5765 pmulhrsw m1, m7
5766 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5767 pmulhrsw m3, m7
5768 packuswb m1, m3
5769
5770 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5771
5772 RET
5773
; Emit one 16-byte output row of intra_pred_ang16_18.
;   %1 = byte shift into the 32-byte pair m0:m1 (m0 is the high half)
;   %2 = destination address expression
; Clobbers m2.
%macro ANG16_18_ROW 2
    palignr     m2, m0, m1, %1
    movu        [%2], m2
%endmacro

; intra_pred_ang16_18: 16x16 intra prediction, mode 18 (SSE4).
; Row 0 is the 16 bytes at [r3]; every following row is the previous row
; moved up by one byte lane, with the vacated lane 0 refilled from the
; c_mode16_18-shuffled bytes of [r2].
; r0 = destination, r1 = destination stride; r2/r3/r4 are reused as
; 2 * stride, 3 * stride and 4 * stride once the references are loaded.
INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3

    movu        m0, [r3]
    movu        m1, [r2]
    pshufb      m1, [c_mode16_18]

    lea         r2, [r1 * 2]
    lea         r3, [r1 * 3]
    lea         r4, [r1 * 4]

    ; rows 0..3
    movu        [r0], m0
    ANG16_18_ROW 15, r0 + r1
    ANG16_18_ROW 14, r0 + r2
    ANG16_18_ROW 13, r0 + r3

    ; rows 4..7
    lea         r0, [r0 + r4]
    ANG16_18_ROW 12, r0
    ANG16_18_ROW 11, r0 + r1
    ANG16_18_ROW 10, r0 + r2
    ANG16_18_ROW 9, r0 + r3

    ; rows 8..11
    lea         r0, [r0 + r4]
    ANG16_18_ROW 8, r0
    ANG16_18_ROW 7, r0 + r1
    ANG16_18_ROW 6, r0 + r2
    ANG16_18_ROW 5, r0 + r3

    ; rows 12..15
    lea         r0, [r0 + r4]
    ANG16_18_ROW 4, r0
    ANG16_18_ROW 3, r0 + r1
    ANG16_18_ROW 2, r0 + r2
    palignr     m0, m1, 1               ; last row: m0 is dead afterwards, shift in place
    movu        [r0 + r3], m0
    RET
5820
;---------------------------------------------------------------------------------------------------------------
; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------------------------
; 32x32 prediction for the pure-diagonal modes: output row k is the 32
; reference bytes starting at ref + 2 + k. The reference is refAbove when
; dirMode == 34 and refLeft otherwise.

; Emit one 32-byte output row taken from a 48-byte window held in three
; registers.
;   %1 = byte shift into the window (1..15)
;   %2 = low 16 bytes, %3 = middle 16 bytes, %4 = high 16 bytes
;   %5 = destination address expression
; Clobbers m2.
%macro ANG32_2_ROW 5
    palignr     m2, %3, %2, %1
    movu        [%5], m2
    palignr     m2, %4, %3, %1
    movu        [%5 + 16], m2
%endmacro

INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,4,4
    cmp         r4m, byte 34
    cmove       r2, r3mp                ; dirMode == 34 -> read refAbove instead
    movu        m0, [r2 + 2]
    movu        m1, [r2 + 18]
    movu        m3, [r2 + 34]

    lea         r3, [r1 * 3]            ; r3 -> 3 * stride

    ; rows 0..3, window = m3:m1:m0
    movu        [r0], m0
    movu        [r0 + 16], m1
    ANG32_2_ROW 1, m0, m1, m3, r0 + r1
    ANG32_2_ROW 2, m0, m1, m3, r0 + r1 * 2
    ANG32_2_ROW 3, m0, m1, m3, r0 + r3

    ; rows 4..7
    lea         r0, [r0 + r1 * 4]
    ANG32_2_ROW 4, m0, m1, m3, r0
    ANG32_2_ROW 5, m0, m1, m3, r0 + r1
    ANG32_2_ROW 6, m0, m1, m3, r0 + r1 * 2
    ANG32_2_ROW 7, m0, m1, m3, r0 + r3

    ; rows 8..11
    lea         r0, [r0 + r1 * 4]
    ANG32_2_ROW 8, m0, m1, m3, r0
    ANG32_2_ROW 9, m0, m1, m3, r0 + r1
    ANG32_2_ROW 10, m0, m1, m3, r0 + r1 * 2
    ANG32_2_ROW 11, m0, m1, m3, r0 + r3

    ; rows 12..15
    lea         r0, [r0 + r1 * 4]
    ANG32_2_ROW 12, m0, m1, m3, r0
    ANG32_2_ROW 13, m0, m1, m3, r0 + r1
    ANG32_2_ROW 14, m0, m1, m3, r0 + r1 * 2
    ANG32_2_ROW 15, m0, m1, m3, r0 + r3

    ; rows 16..19, window slides by 16 bytes: m0(new):m3:m1
    lea         r0, [r0 + r1 * 4]
    movu        [r0], m1
    movu        m0, [r2 + 50]
    movu        [r0 + 16], m3
    ANG32_2_ROW 1, m1, m3, m0, r0 + r1
    ANG32_2_ROW 2, m1, m3, m0, r0 + r1 * 2
    ANG32_2_ROW 3, m1, m3, m0, r0 + r3

    ; rows 20..23
    lea         r0, [r0 + r1 * 4]
    ANG32_2_ROW 4, m1, m3, m0, r0
    ANG32_2_ROW 5, m1, m3, m0, r0 + r1
    ANG32_2_ROW 6, m1, m3, m0, r0 + r1 * 2
    ANG32_2_ROW 7, m1, m3, m0, r0 + r3

    ; rows 24..27
    lea         r0, [r0 + r1 * 4]
    ANG32_2_ROW 8, m1, m3, m0, r0
    ANG32_2_ROW 9, m1, m3, m0, r0 + r1
    ANG32_2_ROW 10, m1, m3, m0, r0 + r1 * 2
    ANG32_2_ROW 11, m1, m3, m0, r0 + r3

    ; rows 28..31
    lea         r0, [r0 + r1 * 4]
    ANG32_2_ROW 12, m1, m3, m0, r0
    ANG32_2_ROW 13, m1, m3, m0, r0 + r1
    ANG32_2_ROW 14, m1, m3, m0, r0 + r1 * 2
    ANG32_2_ROW 15, m1, m3, m0, r0 + r3
    RET
5981
5982; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
; Parameters:
;   %1      column-group index; only used by the transposed store, as a byte
;           offset of %1 * 8
;   %2      1 = transpose the 8x8 result before storing (rows go to r0 and r6),
;           otherwise store the eight rows untransposed starting at r0
;   %3-%10  weight-row index for m0..m7; the row is read from [r4 + index * 16].
;           An index of 0 skips the filter for that register: even registers
;           (m0/m2/m4/m6) are left untouched, odd ones are only widened with
;           pmovzxbw before the pack.
; Expects r3 to point at a pshufb control vector, r4 at the weight table,
; r1 = stride and r5 = 3 * stride; the transposed store also uses r6.
; Clobbers m0-m7 (m1 is reloaded with pw_1024 after the first pair); the
; untransposed path also advances r0 by 4 * stride.
5983%macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
5984 %if %3 == 0
5985 %else
5986 pshufb m0, [r3]
5987 pmaddubsw m0, [r4 + %3 * 16]
5988 pmulhrsw m0, [pw_1024]
5989 %endif
5990 %if %4 == 0
5991 pmovzxbw m1, m1
5992 %else
5993 pshufb m1, [r3]
5994 pmaddubsw m1, [r4 + %4 * 16]
5995 pmulhrsw m1, [pw_1024]
5996 %endif
5997 %if %3 == 0
5998 packuswb m1, m1
5999 movlhps m0, m1
6000 %else
6001 packuswb m0, m1
6002 %endif
6003 mova m1, [pw_1024] ; m1 is free now: keep the rounding constant in a register
6004 %if %5 == 0
6005 %else
6006 pshufb m2, [r3]
6007 pmaddubsw m2, [r4 + %5 * 16]
6008 pmulhrsw m2, m1
6009 %endif
6010 %if %6 == 0
6011 pmovzxbw m3, m3
6012 %else
6013 pshufb m3, [r3]
6014 pmaddubsw m3, [r4 + %6 * 16]
6015 pmulhrsw m3, m1
6016 %endif
6017 %if %5 == 0
6018 packuswb m3, m3
6019 movlhps m2, m3
6020 %else
6021 packuswb m2, m3
6022 %endif
6023 %if %7 == 0
6024 %else
6025 pshufb m4, [r3]
6026 pmaddubsw m4, [r4 + %7 * 16]
6027 pmulhrsw m4, m1
6028 %endif
6029 %if %8 == 0
6030 pmovzxbw m5, m5
6031 %else
6032 pshufb m5, [r3]
6033 pmaddubsw m5, [r4 + %8 * 16]
6034 pmulhrsw m5, m1
6035 %endif
6036 %if %7 == 0
6037 packuswb m5, m5
6038 movlhps m4, m5
6039 %else
6040 packuswb m4, m5
6041 %endif
6042 %if %9 == 0
6043 %else
6044 pshufb m6, [r3]
6045 pmaddubsw m6, [r4 + %9 * 16]
6046 pmulhrsw m6, m1
6047 %endif
6048 %if %10 == 0
6049 pmovzxbw m7, m7
6050 %else
6051 pshufb m7, [r3]
6052 pmaddubsw m7, [r4 + %10 * 16]
6053 pmulhrsw m7, m1
6054 %endif
6055 %if %9 == 0
6056 packuswb m7, m7
6057 movlhps m6, m7
6058 %else
6059 packuswb m6, m7
6060 %endif
6061
6062 %if %2 == 1
6063 ; transpose
6064 punpckhbw m1, m0, m2
6065 punpcklbw m0, m2
6066 punpckhbw m3, m0, m1
6067 punpcklbw m0, m1
6068
6069 punpckhbw m1, m4, m6
6070 punpcklbw m4, m6
6071 punpckhbw m6, m4, m1
6072 punpcklbw m4, m1
6073
6074 punpckhdq m2, m0, m4
6075 punpckldq m0, m4
6076 punpckldq m4, m3, m6
6077 punpckhdq m3, m6
6078
6079 movh [r0 + + %1 * 8], m0
6080 movhps [r0 + r1 + %1 * 8], m0
6081 movh [r0 + r1*2 + %1 * 8], m2
6082 movhps [r0 + r5 + %1 * 8], m2
6083 movh [r6 + %1 * 8], m4
6084 movhps [r6 + r1 + %1 * 8], m4
6085 movh [r6 + r1*2 + %1 * 8], m3
6086 movhps [r6 + r5 + %1 * 8], m3
6087 %else
6088 movh [r0 ], m0
6089 movhps [r0 + r1 ], m0
6090 movh [r0 + r1 * 2], m2
6091 movhps [r0 + r5 ], m2
6092 lea r0, [r0 + r1 * 4]
6093 movh [r0 ], m4
6094 movhps [r0 + r1 ], m4
6095 movh [r0 + r1 * 2], m6
6096 movhps [r0 + r5 ], m6
6097 %endif
6098%endmacro
6099
; MODE_3_33: one pass of the 32x32 angular predictor shared by modes 3 and 33.
;   %1 = forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;        (macro defined elsewhere in this file)
; Expects r2 = reference pointer, r3 = weight table base (the callers use
; ang_table + 16 * 16) and m7 = pw_1024. Reference bytes are interleaved into
; adjacent pairs, multiplied by the weight row named in the bracketed comment
; and rounded with pmulhrsw. Results are handed to TRANSPOSE_STORE_8x8 in
; four groups (first argument 0..3), eight 8-byte vectors per group.
; A weight of [00] needs no filtering, so those vectors are loaded straight
; from the reference with movhps.
; Clobbers m0-m6 directly.
6100%macro MODE_3_33 1
6101 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6102 palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
6103 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
6104 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
6105 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
6106 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
6107 pmulhrsw m4, m7
6108 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6109 pmulhrsw m1, m7
6110 packuswb m4, m1
6111 palignr m5, m2, m0, 4
6112 pmaddubsw m5, [r3 - 2 * 16] ; [14]
6113 pmulhrsw m5, m7
6114 palignr m6, m2, m0, 6
6115 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
6116 pmulhrsw m6, m7
6117 packuswb m5, m6
6118 palignr m1, m2, m0, 8
6119 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
6120 pmulhrsw m6, m7
6121 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6122 pmulhrsw m1, m7
6123 packuswb m6, m1
6124 palignr m1, m2, m0, 10
6125 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6126 pmulhrsw m1, m7
6127 palignr m2, m0, 12
6128 pmaddubsw m2, [r3] ; [16]
6129 pmulhrsw m2, m7
6130 packuswb m1, m2
6131
6132 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6133
6134 movu m0, [r2 + 8]
6135 palignr m1, m0, 1
6136 punpckhbw m2, m0, m1
6137 punpcklbw m0, m1
6138 palignr m5, m2, m0, 2
6139 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
6140 pmulhrsw m4, m7
6141 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
6142 pmulhrsw m1, m7
6143 packuswb m4, m1
6144 pmaddubsw m5, [r3 + 14 * 16] ; [30]
6145 pmulhrsw m5, m7
6146 palignr m6, m2, m0, 4
6147 pmaddubsw m6, [r3 + 8 * 16] ; [24]
6148 pmulhrsw m6, m7
6149 packuswb m5, m6
6150 palignr m1, m2, m0, 6
6151 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
6152 pmulhrsw m6, m7
6153 palignr m1, m2, m0, 8
6154 pmaddubsw m1, [r3 - 4 * 16] ; [12]
6155 pmulhrsw m1, m7
6156 packuswb m6, m1
6157 palignr m1, m2, m0, 10
6158 pmaddubsw m1, [r3 - 10 * 16] ; [06]
6159 pmulhrsw m1, m7
6160 packuswb m1, m1
6161 movhps m1, [r2 + 14] ; [00]
6162
6163 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6164
6165 movu m0, [r2 + 14]
6166 palignr m1, m0, 1
6167 punpckhbw m2, m0, m1
6168 punpcklbw m0, m1
6169 palignr m1, m2, m0, 2
6170 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
6171 pmulhrsw m4, m7
6172 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6173 pmulhrsw m1, m7
6174 packuswb m4, m1
6175 palignr m5, m2, m0, 4
6176 pmaddubsw m5, [r3 - 2 * 16] ; [14]
6177 pmulhrsw m5, m7
6178 palignr m6, m2, m0, 6
6179 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
6180 pmulhrsw m6, m7
6181 packuswb m5, m6
6182 palignr m1, m2, m0, 8
6183 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
6184 pmulhrsw m6, m7
6185 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6186 pmulhrsw m1, m7
6187 packuswb m6, m1
6188 palignr m1, m2, m0, 10
6189 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6190 pmulhrsw m1, m7
6191 palignr m2, m0, 12
6192 pmaddubsw m2, [r3] ; [16]
6193 pmulhrsw m2, m7
6194 packuswb m1, m2
6195
6196 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6197
6198 movu m0, [r2 + 21]
6199 palignr m1, m0, 1
6200 punpckhbw m2, m0, m1
6201 punpcklbw m0, m1
6202 palignr m5, m2, m0, 2
6203 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
6204 pmulhrsw m4, m7
6205 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
6206 pmulhrsw m1, m7
6207 packuswb m4, m1
6208 pmaddubsw m5, [r3 + 14 * 16] ; [30]
6209 pmulhrsw m5, m7
6210 palignr m6, m2, m0, 4
6211 pmaddubsw m6, [r3 + 8 * 16] ; [24]
6212 pmulhrsw m6, m7
6213 packuswb m5, m6
6214 palignr m1, m2, m0, 6
6215 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
6216 pmulhrsw m6, m7
6217 palignr m1, m2, m0, 8
6218 pmaddubsw m1, [r3 - 4 * 16] ; [12]
6219 pmulhrsw m1, m7
6220 packuswb m6, m1
6221 palignr m1, m2, m0, 10
6222 pmaddubsw m1, [r3 - 10 * 16] ; [06]
6223 pmulhrsw m1, m7
6224 packuswb m1, m1
6225 movhps m1, [r2 + 27] ; [00]
6226
6227 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6228%endmacro
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------------------------------
; 32x32 angular prediction, mode 3. The block is built in four passes of
; MODE_3_33; between passes the two destination pointers (r0, r6) move down
; by 8 * stride and the reference pointer r2 moves on by 8 bytes.
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    mova        m7, [pw_1024]           ; rounding multiplier used by MODE_3_33
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]       ; r6 -> dst + 4 * stride
    mov         r4d, 4                  ; pass counter
.next_pass:
    MODE_3_33   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .next_pass
    RET
6247
; MODE_4_32: one pass of the 32x32 angular predictor shared by modes 4 and 32.
;   %1 = forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;        (macro defined elsewhere in this file)
; Expects r2 = reference pointer, r3 = weight table base (the caller uses
; ang_table + 16 * 16) and m7 = pw_1024. Reference bytes are interleaved into
; adjacent pairs, multiplied by the weight row named in the bracketed comment
; and rounded with pmulhrsw. Results are handed to TRANSPOSE_STORE_8x8 in
; four groups (first argument 0..3), eight 8-byte vectors per group.
; The final [00] vector needs no filtering and is loaded straight from the
; reference with movhps.
; Clobbers m0-m6 directly.
6248%macro MODE_4_32 1
6249 movu m0, [r2 + 1]
6250 palignr m1, m0, 1
6251 punpckhbw m2, m0, m1
6252 punpcklbw m0, m1
6253 palignr m1, m2, m0, 2
6254 mova m5, m1
6255 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
6256 pmulhrsw m4, m7
6257 pmaddubsw m1, [r3 - 6 * 16] ; [10]
6258 pmulhrsw m1, m7
6259 packuswb m4, m1
6260 pmaddubsw m5, [r3 + 15 * 16] ; [31]
6261 pmulhrsw m5, m7
6262 palignr m6, m2, m0, 4
6263 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
6264 pmulhrsw m6, m7
6265 packuswb m5, m6
6266 palignr m1, m2, m0, 6
6267 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
6268 pmulhrsw m6, m7
6269 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6270 pmulhrsw m1, m7
6271 packuswb m6, m1
6272 palignr m1, m2, m0, 8
6273 pmaddubsw m1, [r3 + 3 * 16] ; [19]
6274 pmulhrsw m1, m7
6275 palignr m2, m0, 10
6276 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
6277 pmulhrsw m3, m7
6278 packuswb m1, m3
6279
6280 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6281
6282 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6283 pmulhrsw m4, m7
6284 movu m0, [r2 + 6]
6285 palignr m1, m0, 1
6286 punpckhbw m2, m0, m1
6287 punpcklbw m0, m1
6288 palignr m1, m2, m0, 2
6289 pmaddubsw m1, [r3 + 2 * 16] ; [18]
6290 pmulhrsw m1, m7
6291 packuswb m4, m1
6292 palignr m5, m2, m0, 4
6293 mova m6, m5
6294 pmaddubsw m5, [r3 - 9 * 16] ; [07]
6295 pmulhrsw m5, m7
6296 pmaddubsw m6, [r3 + 12 * 16] ; [28]
6297 pmulhrsw m6, m7
6298 packuswb m5, m6
6299 palignr m6, m2, m0, 6
6300 pmaddubsw m6, [r3 + 16] ; [17]
6301 pmulhrsw m6, m7
6302 palignr m1, m2, m0, 8
6303 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
6304 pmulhrsw m3, m7
6305 packuswb m6, m3
6306 pmaddubsw m1, [r3 + 11 * 16] ; [27]
6307 pmulhrsw m1, m7
6308 palignr m2, m0, 10
6309 pmaddubsw m2, [r3] ; [16]
6310 pmulhrsw m2, m7
6311 packuswb m1, m2
6312
6313 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6314
6315 movu m0, [r2 + 12]
6316 palignr m1, m0, 1
6317 punpckhbw m2, m0, m1
6318 punpcklbw m0, m1
6319 mova m1, m0
6320 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6321 pmulhrsw m4, m7
6322 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6323 pmulhrsw m1, m7
6324 packuswb m4, m1
6325 palignr m5, m2, m0, 2
6326 pmaddubsw m5, [r3 - 16] ; [15]
6327 pmulhrsw m5, m7
6328 palignr m6, m2, m0, 4
6329 mova m1, m6
6330 pmaddubsw m1, [r3 - 12 * 16] ; [4]
6331 pmulhrsw m1, m7
6332 packuswb m5, m1
6333 pmaddubsw m6, [r3 + 9 * 16] ; [25]
6334 pmulhrsw m6, m7
6335 palignr m1, m2, m0, 6
6336 pmaddubsw m1, [r3 - 2 * 16] ; [14]
6337 pmulhrsw m1, m7
6338 packuswb m6, m1
6339 palignr m1, m2, m0, 8
6340 mova m2, m1
6341 pmaddubsw m1, [r3 - 13 * 16] ; [3]
6342 pmulhrsw m1, m7
6343 pmaddubsw m2, [r3 + 8 * 16] ; [24]
6344 pmulhrsw m2, m7
6345 packuswb m1, m2
6346
6347 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6348
6349 movu m0, [r2 + 17]
6350 palignr m1, m0, 1
6351 punpckhbw m2, m0, m1
6352 punpcklbw m0, m1
6353 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
6354 pmulhrsw m4, m7
6355 palignr m5, m2, m0, 2
6356 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
6357 pmulhrsw m1, m7
6358 packuswb m4, m1
6359 pmaddubsw m5, [r3 + 7 * 16] ; [23]
6360 pmulhrsw m5, m7
6361 palignr m6, m2, m0, 4
6362 pmaddubsw m6, [r3 - 4 * 16] ; [12]
6363 pmulhrsw m6, m7
6364 packuswb m5, m6
6365 palignr m6, m2, m0, 6
6366 mova m1, m6
6367 pmaddubsw m6, [r3 - 15 * 16] ; [1]
6368 pmulhrsw m6, m7
6369 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6370 pmulhrsw m1, m7
6371 packuswb m6, m1
6372 palignr m1, m2, m0, 8
6373 pmaddubsw m1, [r3 - 5 * 16] ; [11]
6374 pmulhrsw m1, m7
6375 packuswb m1, m1
6376 movhps m1, [r2 + 22] ; [00]
6377
6378 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6379%endmacro
6380;-----------------------------------------------------------------------------------------------------------------
6381; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6382;-----------------------------------------------------------------------------------------------------------------
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = refLeft. Seven GPRs and eight XMM registers are requested.
; The body runs MODE_4_32 four times, stepping the reference by 8 bytes per pass.
; NOTE(review): MODE_4_32 starts before this chunk, and TRANSPOSE_STORE_8x8,
; ang_table and pw_1024 are defined elsewhere -- their contracts are assumed.
6383INIT_XMM sse4
6384cglobal intra_pred_ang32_4, 3,7,8
; r3 -> ang_table + 256 bytes; the macro addresses the table as [r3 +/- k * 16]
6385 lea r3, [ang_table + 16 * 16]
; r4 = pass counter (4 passes)
6386 mov r4d, 4
6387 lea r5, [r1 * 3] ; r5 -> 3 * stride
6388 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macro
6389 mova m7, [pw_1024]
6390.loop:
6391 MODE_4_32 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
6392 lea r0, [r6 + r1 * 4]
6393 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
6394 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
6395 dec r4
6396 jnz .loop
6397 RET
6398
6399%macro MODE_5_31 1
; Produces four groups of results from the reference bytes at r2 and passes
; each group to TRANSPOSE_STORE_8x8 (index 0..3, %1 forwarded unchanged).
; In:   r2 = reference bytes, r3 = table base (ang_table + 16 * 16 in the
;       caller in this file), m7 = pmulhrsw multiplier (caller loads [pw_1024]).
; Uses: m0-m6 as scratch.
; Every result is: pmaddubsw of interleaved neighbouring bytes with one 16-byte
; table entry (the trailing [n] is the entry number, r3 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
; ---- group 0: bytes from [r2 + 1] ----
6400 movu m0, [r2 + 1]
; m1 = m0 shifted down one byte (its top byte comes from the old m1)
6401 palignr m1, m0, 1
; m0 / m2 = low / high (b[i], b[i+1]) byte pairs
6402 punpckhbw m2, m0, m1
6403 punpcklbw m0, m1
6404 palignr m1, m2, m0, 2
6405 mova m5, m1
6406 pmaddubsw m4, m0, [r3 + 16] ; [17]
6407 pmulhrsw m4, m7
6408 pmaddubsw m1, [r3 - 14 * 16] ; [2]
6409 pmulhrsw m1, m7
6410 packuswb m4, m1
6411 pmaddubsw m5, [r3 + 3 * 16] ; [19]
6412 pmulhrsw m5, m7
6413 palignr m6, m2, m0, 4
6414 mova m1, m6
6415 pmaddubsw m6, [r3 - 12 * 16] ; [4]
6416 pmulhrsw m6, m7
6417 packuswb m5, m6
6418 pmaddubsw m6, m1, [r3 + 5 * 16] ; [21]
6419 pmulhrsw m6, m7
6420 palignr m1, m2, m0, 6
6421 mova m3, m1
6422 pmaddubsw m3, [r3 - 10 * 16] ; [6]
6423 pmulhrsw m3, m7
6424 packuswb m6, m3
6425 pmaddubsw m1, [r3 + 7 * 16] ; [23]
6426 pmulhrsw m1, m7
6427 palignr m2, m0, 8
6428 pmaddubsw m2, [r3 - 8 * 16] ; [8]
6429 pmulhrsw m2, m7
6430 packuswb m1, m2
6431
6432 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6433
; ---- group 1: bytes from [r2 + 5] ----
6434 movu m0, [r2 + 5]
6435 palignr m1, m0, 1
6436 punpckhbw m2, m0, m1
6437 punpcklbw m0, m1
6438 palignr m1, m2, m0, 2
6439 mova m5, m1
6440 pmaddubsw m4, m0, [r3 + 9 * 16] ; [25]
6441 pmulhrsw m4, m7
6442 pmaddubsw m1, [r3 - 6 * 16] ; [10]
6443 pmulhrsw m1, m7
6444 packuswb m4, m1
6445 pmaddubsw m5, [r3 + 11 * 16] ; [27]
6446 pmulhrsw m5, m7
6447 palignr m6, m2, m0, 4
6448 mova m1, m6
6449 pmaddubsw m6, [r3 - 4 * 16] ; [12]
6450 pmulhrsw m6, m7
6451 packuswb m5, m6
6452 pmaddubsw m6, m1, [r3 + 13 * 16] ; [29]
6453 pmulhrsw m6, m7
6454 palignr m1, m2, m0, 6
6455 mova m3, m1
6456 pmaddubsw m3, [r3 - 2 * 16] ; [14]
6457 pmulhrsw m3, m7
6458 packuswb m6, m3
6459 pmaddubsw m1, [r3 + 15 * 16] ; [31]
6460 pmulhrsw m1, m7
6461 palignr m2, m0, 8
6462 pmaddubsw m2, [r3] ; [16]
6463 pmulhrsw m2, m7
6464 packuswb m1, m2
6465
6466 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6467
; ---- group 2: bytes from [r2 + 10] ----
6468 movu m0, [r2 + 10]
6469 palignr m1, m0, 1
6470 punpckhbw m2, m0, m1
6471 punpcklbw m0, m1
6472 mova m1, m0
6473 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
6474 pmulhrsw m4, m7
6475 pmaddubsw m1, [r3 + 2 * 16] ; [18]
6476 pmulhrsw m1, m7
6477 packuswb m4, m1
6478 palignr m5, m2, m0, 2
6479 mova m1, m5
6480 pmaddubsw m5, [r3 - 13 * 16] ; [3]
6481 pmulhrsw m5, m7
6482 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6483 pmulhrsw m1, m7
6484 packuswb m5, m1
6485 palignr m1, m2, m0, 4
6486 pmaddubsw m6, m1, [r3 - 11 * 16] ; [5]
6487 pmulhrsw m6, m7
6488 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6489 pmulhrsw m1, m7
6490 packuswb m6, m1
6491 palignr m2, m0, 6
6492 pmaddubsw m1, m2, [r3 - 9 * 16] ; [7]
6493 pmulhrsw m1, m7
6494 pmaddubsw m2, [r3 + 8 * 16] ; [24]
6495 pmulhrsw m2, m7
6496 packuswb m1, m2
6497
6498 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6499
; ---- group 3: bytes from [r2 + 14] ----
6500 movu m0, [r2 + 14]
6501 palignr m1, m0, 1
6502 punpckhbw m2, m0, m1
6503 punpcklbw m0, m1
6504 mova m1, m0
6505 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
6506 pmulhrsw m4, m7
6507 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6508 pmulhrsw m1, m7
6509 packuswb m4, m1
6510 palignr m5, m2, m0, 2
6511 mova m1, m5
6512 pmaddubsw m5, [r3 - 5 * 16] ; [11]
6513 pmulhrsw m5, m7
6514 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6515 pmulhrsw m1, m7
6516 packuswb m5, m1
6517 palignr m1, m2, m0, 4
6518 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
6519 pmulhrsw m6, m7
6520 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6521 pmulhrsw m1, m7
6522 packuswb m6, m1
6523 palignr m2, m0, 6
6524 pmaddubsw m1, m2, [r3 - 16] ; [15]
6525 pmulhrsw m1, m7
6526 packuswb m1, m1
; last result needs no weighting: its 8 bytes are loaded straight from the reference
6527 movhps m1, [r2 + 18] ; [00]
6528
6529 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6530%endmacro
6531;------------------------------------------------------------------------------------------------------------------
6532; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6533;------------------------------------------------------------------------------------------------------------------
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = refLeft. Seven GPRs and eight XMM registers are requested.
; The body runs MODE_5_31 four times, stepping the reference by 8 bytes per pass.
; NOTE(review): TRANSPOSE_STORE_8x8 (used by the macro), ang_table and pw_1024
; are defined elsewhere -- their contracts are assumed.
6534INIT_XMM sse4
6535cglobal intra_pred_ang32_5, 3,7,8
; r3 -> ang_table + 256 bytes; the macro addresses the table as [r3 +/- k * 16]
6536 lea r3, [ang_table + 16 * 16]
; r4 = pass counter (4 passes)
6537 mov r4d, 4
6538 lea r5, [r1 * 3] ; r5 -> 3 * stride
6539 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macro
6540 mova m7, [pw_1024]
6541.loop:
6542 MODE_5_31 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
6543 lea r0, [r6 + r1 * 4]
6544 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
6545 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
6546 dec r4
6547 jnz .loop
6548 RET
6549
6550%macro MODE_6_30 1
; Produces four groups of results from the reference bytes at r2 and passes
; each group to TRANSPOSE_STORE_8x8 (index 0..3, %1 forwarded unchanged).
; In:   r2 = reference bytes, r3 = table base (ang_table + 16 * 16 in the
;       caller in this file), m7 = pmulhrsw multiplier (caller loads [pw_1024]).
; Uses: m0-m6 as scratch.
; Every result is: pmaddubsw of interleaved neighbouring bytes with one 16-byte
; table entry (the trailing [n] is the entry number, r3 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; m2 is carried across the first two TRANSPOSE_STORE_8x8 calls, so that macro
; must leave m2 intact.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
; ---- group 0: bytes from [r2 + 1] ----
6551 movu m0, [r2 + 1]
; m1 = m0 shifted down one byte (its top byte comes from the old m1)
6552 palignr m1, m0, 1
; m0 / m2 = low / high (b[i], b[i+1]) byte pairs
6553 punpckhbw m2, m0, m1
6554 punpcklbw m0, m1
6555 mova m1, m0
6556 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
6557 pmulhrsw m4, m7
6558 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6559 pmulhrsw m1, m7
6560 packuswb m4, m1
6561 palignr m6, m2, m0, 2
6562 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
6563 pmulhrsw m5, m7
6564 pmaddubsw m6, [r3 + 4 * 16] ; [20]
6565 pmulhrsw m6, m7
6566 packuswb m5, m6
6567 palignr m1, m2, m0, 4
6568 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
6569 pmulhrsw m6, m7
6570 pmaddubsw m3, m1, [r3 - 2 * 16] ; [14]
6571 pmulhrsw m3, m7
6572 packuswb m6, m3
6573 pmaddubsw m1, [r3 + 11 * 16] ; [27]
6574 pmulhrsw m1, m7
6575 palignr m2, m0, 6
6576 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
6577 pmulhrsw m3, m7
6578 packuswb m1, m3
6579
6580 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6581
; ---- group 1: first result reuses m2 from group 0, the rest use [r2 + 5] ----
6582 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
6583 pmulhrsw m4, m7
6584 movu m0, [r2 + 5]
6585 palignr m1, m0, 1
6586 punpckhbw m2, m0, m1
6587 punpcklbw m0, m1
6588 mova m6, m0
6589 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
6590 pmulhrsw m1, m7
6591 packuswb m4, m1
6592 pmaddubsw m5, m6, [r3 - 16] ; [15]
6593 pmulhrsw m5, m7
6594 pmaddubsw m6, [r3 + 12 * 16] ; [28]
6595 pmulhrsw m6, m7
6596 packuswb m5, m6
6597 palignr m3, m2, m0, 2
6598 pmaddubsw m6, m3, [r3 - 7 * 16] ; [9]
6599 pmulhrsw m6, m7
6600 pmaddubsw m3, [r3 + 6 * 16] ; [22]
6601 pmulhrsw m3, m7
6602 packuswb m6, m3
6603 palignr m2, m0, 4
6604 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
6605 pmulhrsw m1, m7
6606 pmaddubsw m3, m2, [r3] ; [16]
6607 pmulhrsw m3, m7
6608 packuswb m1, m3
6609
6610 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6611
; ---- group 2: first result reuses m2 from group 1, the rest use [r2 + 7] ----
6612 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6613 pmulhrsw m4, m7
6614 movu m0, [r2 + 7]
6615 palignr m1, m0, 1
6616 punpckhbw m2, m0, m1
6617 punpcklbw m0, m1
6618 palignr m5, m2, m0, 2
6619 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
6620 pmulhrsw m1, m7
6621 packuswb m4, m1
6622 pmaddubsw m5, [r3 + 7 * 16] ; [23]
6623 pmulhrsw m5, m7
6624 palignr m1, m2, m0, 4
6625 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
6626 pmulhrsw m6, m7
6627 packuswb m5, m6
6628 pmaddubsw m6, m1, [r3 + 16] ; [17]
6629 pmulhrsw m6, m7
6630 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6631 pmulhrsw m1, m7
6632 packuswb m6, m1
6633 palignr m2, m2, m0, 6
6634 pmaddubsw m1, m2, [r3 - 5 * 16] ; [11]
6635 pmulhrsw m1, m7
6636 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
6637 pmulhrsw m2, m7
6638 packuswb m1, m2
6639
6640 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6641
; ---- group 3: bytes from [r2 + 11] ----
6642 movu m0, [r2 + 11]
6643 palignr m1, m0, 1
6644 punpckhbw m2, m0, m1
6645 punpcklbw m0, m1
6646 mova m5, m0
6647 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6648 pmulhrsw m4, m7
6649 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
6650 pmulhrsw m3, m7
6651 packuswb m4, m3
6652 pmaddubsw m5, [r3 + 15 * 16] ; [31]
6653 pmulhrsw m5, m7
6654 palignr m6, m2, m0, 2
6655 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
6656 pmulhrsw m1, m7
6657 packuswb m5, m1
6658 pmaddubsw m6, [r3 + 9 * 16] ; [25]
6659 pmulhrsw m6, m7
6660 palignr m1, m2, m0, 4
6661 pmaddubsw m2, m1, [r3 - 10 * 16] ; [6]
6662 pmulhrsw m2, m7
6663 packuswb m6, m2
6664 pmaddubsw m1, [r3 + 3 * 16] ; [19]
6665 pmulhrsw m1, m7
6666 packuswb m1, m1
; last result needs no weighting: its 8 bytes are loaded straight from the reference
6667 movhps m1, [r2 + 14] ; [00]
6668
6669 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6670%endmacro
6671;------------------------------------------------------------------------------------------------------------------
6672; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6673;------------------------------------------------------------------------------------------------------------------
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = refLeft. Seven GPRs and eight XMM registers are requested.
; The body runs MODE_6_30 four times, stepping the reference by 8 bytes per pass.
; NOTE(review): TRANSPOSE_STORE_8x8 (used by the macro), ang_table and pw_1024
; are defined elsewhere -- their contracts are assumed.
6674INIT_XMM sse4
6675cglobal intra_pred_ang32_6, 3,7,8
; r3 -> ang_table + 256 bytes; the macro addresses the table as [r3 +/- k * 16]
6676 lea r3, [ang_table + 16 * 16]
; r4 = pass counter (4 passes)
6677 mov r4d, 4
6678 lea r5, [r1 * 3] ; r5 -> 3 * stride
6679 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macro
6680 mova m7, [pw_1024]
6681.loop:
6682 MODE_6_30 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
6683 lea r0, [r6 + r1 * 4]
6684 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
6685 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
6686 dec r4
6687 jnz .loop
6688 RET
6689
6690%macro MODE_7_29 1
; Produces four groups of results from the reference bytes at r2 and passes
; each group to TRANSPOSE_STORE_8x8 (index 0..3, %1 forwarded unchanged).
; In:   r2 = reference bytes, r3 = table base (ang_table + 16 * 16 in the
;       caller in this file), m7 = pmulhrsw multiplier (caller loads [pw_1024]).
; Uses: m0-m6 as scratch.
; Every result is: pmaddubsw of interleaved neighbouring bytes with one 16-byte
; table entry (the trailing [n] is the entry number, r3 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; m2 is carried across the first two TRANSPOSE_STORE_8x8 calls, so that macro
; must leave m2 intact.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
; ---- group 0: bytes from [r2 + 1] ----
6691 movu m0, [r2 + 1]
; m1 = m0 shifted down one byte (its top byte comes from the old m1)
6692 palignr m1, m0, 1
; m0 / m2 = low / high (b[i], b[i+1]) byte pairs
6693 punpckhbw m2, m0, m1
6694 punpcklbw m0, m1
6695 mova m5, m0
6696 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
6697 pmulhrsw m4, m7
6698 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
6699 pmulhrsw m3, m7
6700 packuswb m4, m3
6701 pmaddubsw m5, [r3 + 11 * 16] ; [27]
6702 pmulhrsw m5, m7
; both shifted pair vectors are built here, before m0 is reused as a temporary
6703 palignr m1, m2, m0, 2
6704 palignr m2, m0, 4
6705 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
6706 pmulhrsw m6, m7
6707 packuswb m5, m6
6708 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
6709 pmulhrsw m6, m7
6710 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
6711 pmulhrsw m0, m7
6712 packuswb m6, m0
6713 pmaddubsw m1, [r3 + 15 * 16] ; [31]
6714 pmulhrsw m1, m7
6715 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
6716 pmulhrsw m0, m7
6717 packuswb m1, m0
6718
6719 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6720
; ---- group 1: first two results reuse m2 from group 0, the rest use [r2 + 4] ----
6721 pmaddubsw m4, m2, [r3 + 16] ; [17]
6722 pmulhrsw m4, m7
6723 pmaddubsw m2, [r3 + 10 * 16] ; [26]
6724 pmulhrsw m2, m7
6725 packuswb m4, m2
6726 movu m0, [r2 + 4]
6727 palignr m1, m0, 1
6728 punpckhbw m2, m0, m1
6729 punpcklbw m0, m1
6730 palignr m2, m0, 2
6731 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
6732 pmulhrsw m5, m7
6733 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
6734 pmulhrsw m6, m7
6735 packuswb m5, m6
6736 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
6737 pmulhrsw m6, m7
6738 pmaddubsw m0, [r3 + 14 * 16] ; [30]
6739 pmulhrsw m0, m7
6740 packuswb m6, m0
6741 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
6742 pmulhrsw m1, m7
6743 pmaddubsw m3, m2, [r3] ; [16]
6744 pmulhrsw m3, m7
6745 packuswb m1, m3
6746
6747 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6748
; ---- group 2: first result reuses m2 from group 1, the rest use [r2 + 6] ----
6749 pmaddubsw m4, m2, [r3 + 9 * 16] ; [25]
6750 pmulhrsw m4, m7
6751 movu m0, [r2 + 6]
6752 palignr m1, m0, 1
6753 punpckhbw m2, m0, m1
6754 punpcklbw m0, m1
6755 palignr m2, m0, 2
6756 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
6757 pmulhrsw m1, m7
6758 packuswb m4, m1
6759 pmaddubsw m5, m0, [r3 - 5 * 16] ; [11]
6760 pmulhrsw m5, m7
6761 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
6762 pmulhrsw m6, m7
6763 packuswb m5, m6
6764 pmaddubsw m6, m0, [r3 + 13 * 16] ; [29]
6765 pmulhrsw m6, m7
6766 pmaddubsw m1, m2, [r3 - 10 * 16] ; [6]
6767 pmulhrsw m1, m7
6768 packuswb m6, m1
6769 pmaddubsw m1, m2, [r3 - 16] ; [15]
6770 pmulhrsw m1, m7
6771 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
6772 pmulhrsw m2, m7
6773 packuswb m1, m2
6774
6775 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6776
; ---- group 3: bytes from [r2 + 8] ----
6777 movu m0, [r2 + 8]
6778 palignr m1, m0, 1
6779 punpckhbw m2, m0, m1
6780 punpcklbw m0, m1
6781 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
6782 pmulhrsw m4, m7
6783 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
6784 pmulhrsw m3, m7
6785 packuswb m4, m3
6786 pmaddubsw m5, m0, [r3 + 3 * 16] ; [19]
6787 pmulhrsw m5, m7
6788 pmaddubsw m6, m0, [r3 + 12 * 16] ; [28]
6789 pmulhrsw m6, m7
6790 packuswb m5, m6
6791 palignr m2, m0, 2
6792 pmaddubsw m6, m2, [r3 - 11 * 16] ; [5]
6793 pmulhrsw m6, m7
6794 pmaddubsw m0, m2, [r3 - 2 * 16] ; [14]
6795 pmulhrsw m0, m7
6796 packuswb m6, m0
6797 pmaddubsw m1, m2, [r3 + 7 * 16] ; [23]
6798 pmulhrsw m1, m7
6799 packuswb m1, m1
; last result needs no weighting: its 8 bytes are loaded straight from the reference
6800 movhps m1, [r2 + 10] ; [0]
6801
6802 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6803%endmacro
6804;------------------------------------------------------------------------------------------------------------------
6805; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6806;------------------------------------------------------------------------------------------------------------------
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = refLeft. Seven GPRs and eight XMM registers are requested.
; The body runs MODE_7_29 four times, stepping the reference by 8 bytes per pass.
; NOTE(review): TRANSPOSE_STORE_8x8 (used by the macro), ang_table and pw_1024
; are defined elsewhere -- their contracts are assumed.
6807INIT_XMM sse4
6808cglobal intra_pred_ang32_7, 3,7,8
; r3 -> ang_table + 256 bytes; the macro addresses the table as [r3 +/- k * 16]
6809 lea r3, [ang_table + 16 * 16]
; r4 = pass counter (4 passes)
6810 mov r4d, 4
6811 lea r5, [r1 * 3] ; r5 -> 3 * stride
6812 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macro
6813 mova m7, [pw_1024]
6814.loop:
6815 MODE_7_29 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
6816 lea r0, [r6 + r1 * 4]
6817 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
6818 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
6819 dec r4
6820 jnz .loop
6821 RET
6822
6823%macro MODE_8_28 1
; Produces four groups of results from the reference bytes at r2 and passes
; each group to TRANSPOSE_STORE_8x8 (index 0..3, %1 forwarded unchanged).
; In:   r2 = reference bytes, r3 = table base (ang_table + 16 * 16 in the
;       caller in this file), m7 = pmulhrsw multiplier (caller loads [pw_1024]).
; Uses: m0-m6 as scratch.
; Every result is: pmaddubsw of interleaved neighbouring bytes with one 16-byte
; table entry (the trailing [n] is the entry number, r3 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; m2 is carried across the first three TRANSPOSE_STORE_8x8 calls, so that macro
; must leave m2 intact.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
; ---- group 0: bytes from [r2 + 1] ----
6824 movu m0, [r2 + 1]
; m1 = m0 shifted down one byte (its top byte comes from the old m1)
6825 palignr m1, m0, 1
; m0 / m2 = low / high (b[i], b[i+1]) byte pairs; m2 then becomes the pairs one step further on
6826 punpckhbw m2, m0, m1
6827 punpcklbw m0, m1
6828 palignr m2, m0, 2
6829 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6830 pmulhrsw m4, m7
6831 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
6832 pmulhrsw m3, m7
6833 packuswb m4, m3
6834 pmaddubsw m5, m0, [r3 - 1 * 16] ; [15]
6835 pmulhrsw m5, m7
6836 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
6837 pmulhrsw m6, m7
6838 packuswb m5, m6
6839 pmaddubsw m6, m0, [r3 + 9 * 16] ; [25]
6840 pmulhrsw m6, m7
6841 pmaddubsw m0, [r3 + 14 * 16] ; [30]
6842 pmulhrsw m0, m7
6843 packuswb m6, m0
6844 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
6845 pmulhrsw m1, m7
6846 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
6847 pmulhrsw m0, m7
6848 packuswb m1, m0
6849
6850 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6851
; ---- group 1: first four results reuse m2 from group 0, the rest use [r2 + 3] ----
6852 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
6853 pmulhrsw m4, m7
6854 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
6855 pmulhrsw m5, m7
6856 packuswb m4, m5
6857 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
6858 pmulhrsw m5, m7
6859 pmaddubsw m2, [r3 + 12 * 16] ; [28]
6860 pmulhrsw m2, m7
6861 packuswb m5, m2
; only the low byte pairs are consumed from this load, so no high interleave is built
6862 movu m0, [r2 + 3]
6863 palignr m1, m0, 1
6865 punpcklbw m0, m1
6866 pmaddubsw m6, m0, [r3 - 15 * 16] ; [01]
6867 pmulhrsw m6, m7
6868 pmaddubsw m1, m0, [r3 - 10 * 16] ; [06]
6869 pmulhrsw m1, m7
6870 packuswb m6, m1
6871 pmaddubsw m1, m0, [r3 - 5 * 16] ; [11]
6872 pmulhrsw m1, m7
; keep a copy of the pairs in m2 for the next group before m0 is consumed
6873 mova m2, m0
6874 pmaddubsw m0, [r3] ; [16]
6875 pmulhrsw m0, m7
6876 packuswb m1, m0
6877
6878 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6879
; ---- group 2: first three results reuse m2 from group 1, the rest use [r2 + 4] ----
6880 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
6881 pmulhrsw m4, m7
6882 pmaddubsw m5, m2, [r3 + 10 * 16] ; [26]
6883 pmulhrsw m5, m7
6884 packuswb m4, m5
6885 pmaddubsw m5, m2, [r3 + 15 * 16] ; [31]
6886 pmulhrsw m5, m7
; only the low byte pairs are consumed from this load; m2 is free to be a temporary
6887 movu m0, [r2 + 4]
6888 palignr m1, m0, 1
6890 punpcklbw m0, m1
6891 pmaddubsw m2, m0, [r3 - 12 * 16] ; [4]
6892 pmulhrsw m2, m7
6893 packuswb m5, m2
6894 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
6895 pmulhrsw m6, m7
6896 pmaddubsw m1, m0, [r3 - 2 * 16] ; [14]
6897 pmulhrsw m1, m7
6898 packuswb m6, m1
6899 pmaddubsw m1, m0, [r3 + 3 * 16] ; [19]
6900 pmulhrsw m1, m7
; keep a copy of the pairs in m2 for the next group before m0 is consumed
6901 mova m2, m0
6902 pmaddubsw m0, [r3 + 8 * 16] ; [24]
6903 pmulhrsw m0, m7
6904 packuswb m1, m0
6905
6906 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6907
; ---- group 3: first result reuses m2 from group 2, the rest use [r2 + 5] ----
6908 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6909 pmulhrsw m4, m7
; only the low byte pairs are consumed from this load
6910 movu m0, [r2 + 5]
6911 palignr m1, m0, 1
6913 punpcklbw m0, m1
6914 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
6915 pmulhrsw m1, m7
6916 packuswb m4, m1
6917 pmaddubsw m5, m0, [r3 - 9 * 16] ; [7]
6918 pmulhrsw m5, m7
6919 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
6920 pmulhrsw m6, m7
6921 packuswb m5, m6
6922 pmaddubsw m6, m0, [r3 + 16] ; [17]
6923 pmulhrsw m6, m7
6924 pmaddubsw m1, m0, [r3 + 6 * 16] ; [22]
6925 pmulhrsw m1, m7
6926 packuswb m6, m1
6927 pmaddubsw m1, m0, [r3 + 11 * 16] ; [27]
6928 pmulhrsw m1, m7
6929 packuswb m1, m1
; last result needs no weighting: its 8 bytes are loaded straight from the reference
6930 movhps m1, [r2 + 6] ; [00]
6931
6932 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6933%endmacro
6934;------------------------------------------------------------------------------------------------------------------
6935; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6936;------------------------------------------------------------------------------------------------------------------
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = refLeft. Seven GPRs and eight XMM registers are requested.
; The body runs MODE_8_28 four times, stepping the reference by 8 bytes per pass.
; NOTE(review): TRANSPOSE_STORE_8x8 (used by the macro), ang_table and pw_1024
; are defined elsewhere -- their contracts are assumed.
6937INIT_XMM sse4
6938cglobal intra_pred_ang32_8, 3,7,8
; r3 -> ang_table + 256 bytes; the macro addresses the table as [r3 +/- k * 16]
6939 lea r3, [ang_table + 16 * 16]
; r4 = pass counter (4 passes)
6940 mov r4d, 4
6941 lea r5, [r1 * 3] ; r5 -> 3 * stride
6942 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macro
6943 mova m7, [pw_1024]
6944.loop:
6945 MODE_8_28 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
6946 lea r0, [r6 + r1 * 4]
6947 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
6948 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
6949 dec r4
6950 jnz .loop
6951 RET
6952
6953%macro MODE_9_27 1
; Produces four groups of results from the reference bytes at r2 and passes
; each group to TRANSPOSE_STORE_8x8 (index 0..3, %1 forwarded unchanged).
; In:   r2 = reference bytes, r3 = table base (ang_table + 16 * 16 in the
;       caller in this file), m7 = pmulhrsw multiplier (caller loads [pw_1024]).
; Uses: m0-m6 as scratch.
; Every result is: pmaddubsw of the low interleaved neighbouring bytes with one
; 16-byte table entry (the trailing [n] is the entry number, r3 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; Only the low byte pairs are ever used, so no high interleave is built.
; m2 is carried across TRANSPOSE_STORE_8x8 0, so that macro must leave m2 intact.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
; ---- group 0: bytes from [r2 + 1], entries 2..16 ----
6954 movu m2, [r2 + 1]
; m1 = m2 shifted down one byte (its top byte comes from the old m1)
6955 palignr m1, m2, 1
; m2 = low (b[i], b[i+1]) byte pairs
6957 punpcklbw m2, m1
6958 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
6959 pmulhrsw m4, m7
6960 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
6961 pmulhrsw m3, m7
6962 packuswb m4, m3
6963 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
6964 pmulhrsw m5, m7
6965 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
6966 pmulhrsw m6, m7
6967 packuswb m5, m6
6968 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
6969 pmulhrsw m6, m7
6970 pmaddubsw m3, m2, [r3 - 4 * 16] ; [12]
6971 pmulhrsw m3, m7
6972 packuswb m6, m3
6973 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
6974 pmulhrsw m1, m7
6975 pmaddubsw m0, m2, [r3] ; [16]
6976 pmulhrsw m0, m7
6977 packuswb m1, m0
6978
6979 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6980
; ---- group 1: same pairs in m2, entries 18..30, then an unweighted copy ----
6981 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
6982 pmulhrsw m4, m7
6983 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
6984 pmulhrsw m5, m7
6985 packuswb m4, m5
6986 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
6987 pmulhrsw m5, m7
6988 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
6989 pmulhrsw m6, m7
6990 packuswb m5, m6
6991 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
6992 pmulhrsw m6, m7
6993 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
6994 pmulhrsw m1, m7
6995 packuswb m6, m1
6996 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
6997 pmulhrsw m1, m7
6998 packuswb m1, m1
; last result needs no weighting: its 8 bytes are loaded straight from the reference
6999 movhps m1, [r2 + 2] ; [00]
7000
7001 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7002
; ---- group 2: bytes from [r2 + 2], entries 2..16 ----
7003 movu m2, [r2 + 2]
7004 palignr m1, m2, 1
7005 punpcklbw m2, m1
7006 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
7007 pmulhrsw m4, m7
7008 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
7009 pmulhrsw m3, m7
7010 packuswb m4, m3
7011 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
7012 pmulhrsw m5, m7
7013 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
7014 pmulhrsw m6, m7
7015 packuswb m5, m6
7016 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
7017 pmulhrsw m6, m7
7018 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
7019 pmulhrsw m0, m7
7020 packuswb m6, m0
7021 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
7022 pmulhrsw m1, m7
7023 pmaddubsw m0, m2, [r3] ; [16]
7024 pmulhrsw m0, m7
7025 packuswb m1, m0
7026
7027 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7028
; ---- group 3: bytes from [r2 + 2] again, entries 18..30, then an unweighted copy ----
; NOTE(review): this reload rebuilds the same m2 as group 2 if m2 survives
; TRANSPOSE_STORE_8x8 (as it must after group 0); it looks redundant -- confirm.
7029 movu m2, [r2 + 2]
7030 palignr m1, m2, 1
7031 punpcklbw m2, m1
7032 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
7033 pmulhrsw m4, m7
7034 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
7035 pmulhrsw m5, m7
7036 packuswb m4, m5
7037 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
7038 pmulhrsw m5, m7
7039 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
7040 pmulhrsw m6, m7
7041 packuswb m5, m6
7042 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
7043 pmulhrsw m6, m7
7044 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
7045 pmulhrsw m1, m7
7046 packuswb m6, m1
7047 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
7048 pmulhrsw m1, m7
7049 packuswb m1, m1
; last result needs no weighting: its 8 bytes are loaded straight from the reference
7050 movhps m1, [r2 + 3] ; [00]
7051
7052 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7053%endmacro
7054;------------------------------------------------------------------------------------------------------------------
7055; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7056;------------------------------------------------------------------------------------------------------------------
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = refLeft. Seven GPRs and eight XMM registers are requested.
; The body runs MODE_9_27 four times, stepping the reference by 8 bytes per pass.
; NOTE(review): TRANSPOSE_STORE_8x8 (used by the macro), ang_table and pw_1024
; are defined elsewhere -- their contracts are assumed.
7057INIT_XMM sse4
7058cglobal intra_pred_ang32_9, 3,7,8
; r3 -> ang_table + 256 bytes; the macro addresses the table as [r3 +/- k * 16]
7059 lea r3, [ang_table + 16 * 16]
; r4 = pass counter (4 passes)
7060 mov r4d, 4
7061 lea r5, [r1 * 3] ; r5 -> 3 * stride
7062 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macro
7063 mova m7, [pw_1024]
7064.loop:
7065 MODE_9_27 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
7066 lea r0, [r6 + r1 * 4]
7067 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
7068 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
7069 dec r4
7070 jnz .loop
7071 RET
7072
7073;------------------------------------------------------------------------------------------------------------------
7074; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7075;------------------------------------------------------------------------------------------------------------------
; All six arguments are loaded: r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove, r5 = bFilter (r4 is overwritten with 3 * stride at once).
; Each of the two passes takes 16 bytes from [r2 + 1] and writes 16 rows of
; 32 bytes; row k is byte k of that load replicated across the row.
; Row 0 of each pass is written last, at .quit, optionally after the filter.
; m8/m9 are stack slots holding 16 bytes from [r3] and [r3 + 1] for the filter.
; NOTE(review): the filter path stores the same 16 filtered bytes to both
; halves of the row and is taken on both passes when bFilter != 0; verify that
; callers never request the filter for this block size.
7076INIT_XMM sse4
7077cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
7078%define m8 [rsp + 0 * mmsize]
7079%define m9 [rsp + 1 * mmsize]
; r4 = 3 * stride
7080 lea r4, [r1 * 3]
; m7 = 0: used as the pshufb mask that replicates byte 0 across a register
7081 pxor m7, m7
; r6 = pass counter (2 passes of 16 rows)
7082 mov r6, 2
7083 movu m0, [r3]
7084 movu m1, [r3 + 1]
7085 mova m8, m0
7086 mova m9, m1
; keep bFilter in r3d; r5 is reused as a row pointer inside the loop
7087 mov r3d, r5d
7088
7089.loop:
7090 movu m0, [r2 + 1]
; rows 1..6: shift byte k down to position 0, then replicate it
7091 palignr m1, m0, 1
7092 pshufb m1, m7
7093 palignr m2, m0, 2
7094 pshufb m2, m7
7095 palignr m3, m0, 3
7096 pshufb m3, m7
7097 palignr m4, m0, 4
7098 pshufb m4, m7
7099 palignr m5, m0, 5
7100 pshufb m5, m7
7101 palignr m6, m0, 6
7102 pshufb m6, m7
7103
7104 movu [r0 + r1], m1
7105 movu [r0 + r1 + 16], m1
7106 movu [r0 + r1 * 2], m2
7107 movu [r0 + r1 * 2 + 16], m2
7108 movu [r0 + r4], m3
7109 movu [r0 + r4 + 16], m3
; r5 -> row 4
7110 lea r5, [r0 + r1 * 4]
7111 movu [r5], m4
7112 movu [r5 + 16], m4
7113 movu [r5 + r1], m5
7114 movu [r5 + r1 + 16], m5
7115 movu [r5 + r1 * 2], m6
7116 movu [r5 + r1 * 2 + 16], m6
7117
; rows 7..12 (movhlps brings byte 8 down to position 0)
7118 palignr m1, m0, 7
7119 pshufb m1, m7
7120 movhlps m2, m0
7121 pshufb m2, m7
7122 palignr m3, m0, 9
7123 pshufb m3, m7
7124 palignr m4, m0, 10
7125 pshufb m4, m7
7126 palignr m5, m0, 11
7127 pshufb m5, m7
7128 palignr m6, m0, 12
7129 pshufb m6, m7
7130
7131 movu [r5 + r4], m1
7132 movu [r5 + r4 + 16], m1
; r5 -> row 8
7133 lea r5, [r5 + r1 * 4]
7134 movu [r5], m2
7135 movu [r5 + 16], m2
7136 movu [r5 + r1], m3
7137 movu [r5 + r1 + 16], m3
7138 movu [r5 + r1 * 2], m4
7139 movu [r5 + r1 * 2 + 16], m4
7140 movu [r5 + r4], m5
7141 movu [r5 + r4 + 16], m5
; r5 -> row 12
7142 lea r5, [r5 + r1 * 4]
7143 movu [r5], m6
7144 movu [r5 + 16], m6
7145
; rows 13..15, then m0 itself becomes row 0 (byte 0 replicated)
7146 palignr m1, m0, 13
7147 pshufb m1, m7
7148 palignr m2, m0, 14
7149 pshufb m2, m7
7150 palignr m3, m0, 15
7151 pshufb m3, m7
7152 pshufb m0, m7
7153
7154 movu [r5 + r1], m1
7155 movu [r5 + r1 + 16], m1
7156 movu [r5 + r1 * 2], m2
7157 movu [r5 + r1 * 2 + 16], m2
7158 movu [r5 + r4], m3
7159 movu [r5 + r4 + 16], m3
7160
7161; filter
; skip the filter when bFilter == 0
7162 test r3d, r3d
7163 jz .quit
; m0 holds one replicated byte, so both word halves are the same widened value
7165 pmovzxbw m0, m0
7166 mova m1, m0
7167 movu m2, m8
7168 movu m3, m9
7169
; m2 = first byte of [r3] as words; m3/m4 = bytes 0..7 / 8..15 of [r3 + 1] as words
7170 pshufb m2, m7
7171 pmovzxbw m2, m2
7172 movhlps m4, m3
7173 pmovzxbw m3, m3
7174 pmovzxbw m4, m4
; row0[i] = saturate(row0 + ((m9[i] - m8[0]) >> 1)), arithmetic shift
7175 psubw m3, m2
7176 psubw m4, m2
7177 psraw m3, 1
7178 psraw m4, 1
7179 paddw m0, m3
7180 paddw m1, m4
7181 packuswb m0, m1
7182
7183.quit:
7184 movu [r0], m0
7185 movu [r0 + 16], m0
; dec sets ZF for the jnz below; the two lea instructions do not touch flags
7186 dec r6
; r0 -> row 16 of this pass, r2 -> next 16 reference bytes
7187 lea r0, [r5 + r1 * 4]
7188 lea r2, [r2 + 16]
7189 jnz .loop
7190 RET
7191
7192;-------------------------------------------------------------------------------------------------------------------
7193; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7194;-------------------------------------------------------------------------------------------------------------------
; The first four arguments are loaded: r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove. A 64-byte aligned scratch buffer is carved out of the stack:
;   [rsp + 0]       one byte taken from [r3 + 16]
;   [rsp + 1 .. 48] 48 bytes copied from r2
;   [rsp + 63]      byte-sized pass counter
;   [rsp + 64]      caller's rsp, restored before RET
; NOTE(review): PROC32_8x8, c_shuf8_0 and ang_table are defined outside this
; chunk; r3/r4/r5/r6 are set up for that macro and its contract is assumed.
7195INIT_XMM sse4
7196cglobal intra_pred_ang32_11, 4,7,8
7197 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7198
7199 mov r6, rsp
7200 sub rsp, 64+gprsize
7201 and rsp, ~63
7202 mov [rsp+64], r6
7203
7204 ; collect reference pixel
7205 movu m0, [r3 + 16]
7206 pxor m1, m1
7207 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
7208 mova [rsp], m0
7209 movu m0, [r2]
7210 movu m1, [r2 + 16]
7211 movu m2, [r2 + 32]
; the copies start at offset 1, so only byte 0 of the replicated value survives
7212 movu [rsp + 1], m0
7213 movu [rsp + 1 + 16], m1
7214 movu [rsp + 1 + 32], m2
; pass counter lives in the last byte of the buffer
7215 mov [rsp + 63], byte 4
7216
7217 ; filter
7218 lea r2, [rsp + 1] ; r2 -> [0]
7219 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7220 lea r4, [ang_table] ; r4 -> ang_table
7221 lea r5, [r1 * 3] ; r5 -> 3 * stride
7222 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
7225
7226.loop:
7227 ; Row[0 - 7]
; m0-m6 all start as copies of the same 16 reference bytes in m7
7228 movu m7, [r2]
7229 mova m0, m7
7230 mova m1, m7
7231 mova m2, m7
7232 mova m3, m7
7233 mova m4, m7
7234 mova m5, m7
7235 mova m6, m7
7236 PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16
7237
7238 ; Row[8 - 15]
7239 movu m7, [r2]
7240 mova m0, m7
7241 mova m1, m7
7242 mova m2, m7
7243 mova m3, m7
7244 mova m4, m7
7245 mova m5, m7
7246 mova m6, m7
7247 PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0
7248
7249 ; Row[16 - 23]
; one byte earlier: on the first pass this includes the byte from [r3 + 16] at [rsp]
7250 movu m7, [r2 - 1]
7251 mova m0, m7
7252 mova m1, m7
7253 mova m2, m7
7254 mova m3, m7
7255 mova m4, m7
7256 mova m5, m7
7257 mova m6, m7
7258 PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16
7259
7260 ; Row[24 - 31]
7261 movu m7, [r2 - 1]
7262 mova m0, m7
7263 mova m1, m7
7264 mova m2, m7
7265 mova m3, m7
7266 mova m4, m7
7267 mova m5, m7
7268 mova m6, m7
7269 PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0
7270
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
7271 lea r0, [r6 + r1 * 4]
7272 lea r6, [r6 + r1 * 8]
; next 8 reference bytes
7273 add r2, 8
; the counter is decremented in memory; ZF from this dec drives the jnz
7274 dec byte [rsp + 63]
7275 jnz .loop
; restore the caller's stack pointer saved above
7276 mov rsp, [rsp+64]
7277 RET
7278
7279%macro MODE_12_24_ROW0 1
; First pass of the mode 12/24 kernel: builds the extra bytes that sit before
; the main reference, then produces four groups of results and passes each to
; TRANSPOSE_STORE_8x8 (index 0..3, %1 forwarded unchanged).
; In:   r2 = main reference bytes, r3 = secondary reference bytes,
;       r4 = table base (ang_table + 16 * 16 in the caller in this file),
;       m7 = pmulhrsw multiplier, `above` = 16-byte scratch slot (caller %define).
; Uses: m0-m6 as scratch; writes `above`.
; Every weighted result is: pmaddubsw of interleaved neighbouring bytes with one
; 16-byte table entry (the trailing [n] is the entry number, r4 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
; `above` bytes 12..15 = r3[26], r3[19], r3[13], r3[6] (c_mode32_12_0 selects
; bytes 13, 7, 0 of the load at r3 + 6; pinsrb fills byte 12)
7280 movu m0, [r3 + 6]
7281 pshufb m0, [c_mode32_12_0]
7282 pinsrb m0, [r3 + 26], 12
7283 mova above, m0
7284 movu m2, [r2]
; m1 = m2 shifted down one byte (its top byte comes from the old m1)
7285 palignr m1, m2, 1
; m2 = low (b[i], b[i+1]) byte pairs of the main reference
7286 punpcklbw m2, m1
7287 pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
7288 pmulhrsw m4, m7
7289 pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
7290 pmulhrsw m3, m7
7291 packuswb m4, m3
7292 pmaddubsw m5, m2, [r4 + 16] ; [17]
7293 pmulhrsw m5, m7
7294 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7295 pmulhrsw m6, m7
7296 packuswb m5, m6
7297 pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
7298 pmulhrsw m6, m7
7299 pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
7300 pmulhrsw m3, m7
7301 packuswb m6, m3
; step the pair window back by one byte, pulling the last byte of `above` in front
7302 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7303 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
7304 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
7305 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
7306 pmulhrsw m1, m7
7307 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7308 pmulhrsw m3, m7
7309 packuswb m1, m3
7310 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7311 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
7312 pmulhrsw m4, m7
7313 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
7314 pmulhrsw m5, m7
7315 packuswb m4, m5
7316 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
7317 pmulhrsw m5, m7
7318 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7319 pmulhrsw m6, m7
7320 packuswb m5, m6
; shift one more pair in from the top two bytes of `above`
7321 palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
7322 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
7323 pmulhrsw m6, m7
7324 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
7325 pmulhrsw m1, m7
7326 packuswb m6, m1
7327 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
7328 pmulhrsw m1, m7
7329 pmaddubsw m3, m2, [r4] ; [16]
7330 pmulhrsw m3, m7
7331 packuswb m1, m3
7332 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7333 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
7334 pmulhrsw m4, m7
7335 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
7336 pmulhrsw m3, m7
7337 packuswb m4, m3
7338 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
7339 pmulhrsw m5, m7
; `above` shifted up one byte exposes bytes 13 and 14 as the next pair to shift in
7340 pslldq m1, above, 1
7341 palignr m2, m1, 14
7342 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7343 pmulhrsw m6, m7
7344 packuswb m5, m6
7345 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
7346 pmulhrsw m6, m7
7347 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
7348 pmulhrsw m3, m7
7349 packuswb m6, m3
7350 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
7351 pmulhrsw m1, m7
7352 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
7353 pmulhrsw m3, m7
7354 packuswb m1, m3
7355 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7356 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
7357 pmulhrsw m4, m7
; `above` shifted up two bytes exposes bytes 12 and 13 as the next pair to shift in
7358 pslldq m1, above, 2
7359 palignr m2, m1, 14
7360 pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
7361 pmulhrsw m5, m7
7362 packuswb m4, m5
7363 pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
7364 pmulhrsw m5, m7
7365 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7366 pmulhrsw m6, m7
7367 packuswb m5, m6
7368 pmaddubsw m6, m2, [r4 - 16] ; [15]
7369 pmulhrsw m6, m7
7370 pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
7371 pmulhrsw m1, m7
7372 packuswb m6, m1
7373 pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
7374 pmulhrsw m1, m7
; last result is unweighted: pb_fact0 picks the even bytes (first byte of each
; pair) of m2, which are widened to words so packuswb can place them in the high half
7375 movu m0, [pb_fact0]
7376 pshufb m2, m0
7377 pmovzxbw m2, m2
7378 packuswb m1, m2
7379 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7380%endmacro
7381
7382%macro MODE_12_24 1
; Later passes of the mode 12/24 kernel: same table entries as MODE_12_24_ROW0,
; but every byte window is read directly from memory at r2, r2 - 2, r2 - 3 and
; r2 - 4 instead of being assembled from a separate scratch slot.
; In:   r2 = reference bytes (must be readable from r2 - 4),
;       r4 = table base (ang_table + 16 * 16 in the caller in this file),
;       m7 = pmulhrsw multiplier.
; Uses: m0-m6 as scratch.
; Every weighted result is: pmaddubsw of interleaved neighbouring bytes with one
; 16-byte table entry (the trailing [n] is the entry number, r4 offset / 16 + 16),
; pmulhrsw by m7, then packuswb of two 8-word results into one register.
; NOTE(review): assumes pw_1024 holds words of 1024, making pmulhrsw
; (x + 16) >> 5 -- confirm. TRANSPOSE_STORE_8x8 and ang_table are not visible.
7383 movu m2, [r2]
; m1 = m2 shifted down one byte (its top byte comes from the old m1)
7384 palignr m1, m2, 1
; m2 = low byte pairs; m0 = the same pairs advanced by one (via the high interleave)
7385 punpckhbw m0, m2, m1
7386 punpcklbw m2, m1
7387 palignr m0, m2, 2
7388 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
7389 pmulhrsw m4, m7
7390 pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
7391 pmulhrsw m3, m7
7392 packuswb m4, m3
7393 pmaddubsw m5, m0, [r4 + 16] ; [17]
7394 pmulhrsw m5, m7
7395 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
7396 pmulhrsw m6, m7
7397 packuswb m5, m6
7398 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
7399 pmulhrsw m6, m7
7400 pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
7401 pmulhrsw m3, m7
7402 packuswb m6, m3
7403 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
7404 pmulhrsw m1, m7
7405 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7406 pmulhrsw m3, m7
7407 packuswb m1, m3
7408 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7409 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
7410 pmulhrsw m4, m7
7411 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
7412 pmulhrsw m5, m7
7413 packuswb m4, m5
7414 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
7415 pmulhrsw m5, m7
7416 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7417 pmulhrsw m6, m7
7418 packuswb m5, m6
; window two bytes earlier; after the palignr m2 holds the pairs starting at r2 - 1
7419 movu m0, [r2 - 2]
7420 palignr m1, m0, 1
7421 punpckhbw m2, m0, m1
7422 punpcklbw m0, m1
7423 palignr m2, m0, 2
7424 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
7425 pmulhrsw m6, m7
7426 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
7427 pmulhrsw m1, m7
7428 packuswb m6, m1
7429 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
7430 pmulhrsw m1, m7
7431 pmaddubsw m3, m2, [r4] ; [16]
7432 pmulhrsw m3, m7
7433 packuswb m1, m3
7434 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7435 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
7436 pmulhrsw m4, m7
7437 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
7438 pmulhrsw m3, m7
7439 packuswb m4, m3
7440 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
7441 pmulhrsw m5, m7
; window three bytes earlier; after the palignr m2 holds the pairs starting at r2 - 2
7442 movu m0, [r2 - 3]
7443 palignr m1, m0, 1
7444 punpckhbw m2, m0, m1
7445 punpcklbw m0, m1
7446 palignr m2, m0, 2
7447 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7448 pmulhrsw m6, m7
7449 packuswb m5, m6
7450 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
7451 pmulhrsw m6, m7
7452 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
7453 pmulhrsw m3, m7
7454 packuswb m6, m3
7455 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
7456 pmulhrsw m1, m7
7457 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
7458 pmulhrsw m3, m7
7459 packuswb m1, m3
7460 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7461 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
7462 pmulhrsw m4, m7
; window four bytes earlier; after the palignr m0 holds the pairs starting at r2 - 3
7463 movu m2, [r2 - 4]
7464 palignr m1, m2, 1
7465 punpckhbw m0, m2, m1
7466 punpcklbw m2, m1
7467 palignr m0, m2, 2
7468 pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
7469 pmulhrsw m5, m7
7470 packuswb m4, m5
7471 pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
7472 pmulhrsw m5, m7
7473 pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
7474 pmulhrsw m6, m7
7475 packuswb m5, m6
7476 pmaddubsw m6, m0, [r4 - 16] ; [15]
7477 pmulhrsw m6, m7
7478 pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
7479 pmulhrsw m1, m7
7480 packuswb m6, m1
7481 pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
7482 pmulhrsw m1, m7
; last result is unweighted: pb_fact0 picks the even bytes (first byte of each
; pair) of m0, which are widened to words so packuswb can place them in the high half
7483 movu m2, [pb_fact0]
7484 pshufb m0, m2
7485 pmovzxbw m0, m0
7486 packuswb m1, m0
7487 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7488%endmacro
7489;-----------------------------------------------------------------------------------------------------------------
7490; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7491;-----------------------------------------------------------------------------------------------------------------
; The first four arguments are loaded: r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove. One mmsize stack slot is reserved and named `above`.
; The first pass uses MODE_12_24_ROW0 (which reads r3); the remaining three use
; MODE_12_24, which reads only r2-relative memory, so r3 is reused as a counter.
; NOTE(review): TRANSPOSE_STORE_8x8 (used by the macros), ang_table and pw_1024
; are defined elsewhere -- their contracts are assumed.
7492INIT_XMM sse4
7493cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
7494 %define above [rsp + 0 * mmsize]
7495
; r4 -> ang_table + 256 bytes; the macros address the table as [r4 +/- k * 16]
7496 lea r4, [ang_table + 16 * 16]
7497 lea r5, [r1 * 3] ; r5 -> 3 * stride
7498 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; m7 = multiplier used by every pmulhrsw inside the macros
7499 mova m7, [pw_1024]
7500
7501 MODE_12_24_ROW0 1
; r0 = r6 + 4 * stride, computed before r6 itself is bumped by 8 * stride
7502 lea r0, [r6 + r1 * 4]
7503 lea r6, [r6 + r1 * 8]
; first step is 7 bytes, later steps are 8
7504 add r2, 7
; r3 (refAbove) is no longer needed: reuse it as the counter for the last 3 passes
7505 mov r3, 3
7506.loop:
7507 MODE_12_24 1
7508 lea r0, [r6 + r1 * 4]
7509 lea r6, [r6 + r1 * 8]
7510 add r2, 8
; dec sets ZF for the jnz; loop ends when the counter reaches 0
7511 dec r3
7512 jnz .loop
7513 RET
7514
7515%macro MODE_13_23_ROW0 1
 ; First pass of the 32x32 mode 13 / mode 23 predictor.
 ; Expects: r2 = reference being interpolated, r3 = the other reference array,
 ;          r4 = ang_table + 16 * 16, m7 = pmulhrsw multiplier,
 ;          `above` = aligned 16-byte scratch slot (it is written with mova).
 ; %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
 ; The bracketed number after each pmaddubsw is the ang_table entry used
 ; (16 + the offset from r4).
 ; Step 1: gather bytes 3, 6, 10, 13 of [r3 + 1] and of [r3 + 15] (c_mode32_13_0),
 ; join the two dwords and reverse them into the upper 8 bytes of `above`
 ; (c_mode32_13_shuf). Later palignr instructions pull these bytes, two at a
 ; time, into the low lanes of the interleaved pixel register m2.
7516 movu m0, [r3 + 1]
7517 movu m1, [r3 + 15]
7518 pshufb m0, [c_mode32_13_0]
7519 pshufb m1, [c_mode32_13_0]
7520 punpckldq m0, m1
7521 pshufb m0, [c_mode32_13_shuf]
7522 mova above, m0
7523 movu m2, [r2]
7524 palignr m1, m2, 1
7525 punpcklbw m2, m1 ; m2 = pixel pairs (p[i], p[i + 1]) for i = 0..7
7526 pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
7527 pmulhrsw m4, m7
7528 pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
7529 pmulhrsw m3, m7
7530 packuswb m4, m3
7531 pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
7532 pmulhrsw m5, m7
7533 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7534 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
7535 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
7536 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7537 pmulhrsw m6, m7
7538 packuswb m5, m6
7539 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
7540 pmulhrsw m6, m7
7541 pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
7542 pmulhrsw m0, m7
7543 packuswb m6, m0
7544 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
7545 pmulhrsw m1, m7
7546 palignr m2, above, 14 ; shift in the top two bytes of `above` as the next pixel pair
7547 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7548 pmulhrsw m3, m7
7549 packuswb m1, m3
7550 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7551 pmaddubsw m4, m2, [r4 - 16] ; [15]
7552 pmulhrsw m4, m7
7553 pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
7554 pmulhrsw m5, m7
7555 packuswb m4, m5
7556 pslldq m0, above, 1 ; m0 = `above` moved up one byte, exposing the next pair at lanes 14..15
7557 palignr m2, m0, 14
7558 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
7559 pmulhrsw m5, m7
7560 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7561 pmulhrsw m6, m7
7562 packuswb m5, m6
7563 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
7564 pmulhrsw m6, m7
7565 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
7566 pmulhrsw m1, m7
7567 packuswb m6, m1
7568 pslldq m0, 1
7569 palignr m2, m0, 14
7570 pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
7571 pmulhrsw m1, m7
7572 pmaddubsw m0, m2, [r4] ; [16]
7573 pmulhrsw m0, m7
7574 packuswb m1, m0
7575 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7576 pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
7577 pmulhrsw m4, m7
7578 pslldq m0, above, 3 ; m0 was reused as a result register above, so rebuild it from `above`
7579 palignr m2, m0, 14
7580 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
7581 pmulhrsw m3, m7
7582 packuswb m4, m3
7583 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
7584 pmulhrsw m5, m7
7585 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7586 pmulhrsw m6, m7
7587 packuswb m5, m6
7588 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
7589 pmulhrsw m6, m7
7590 pslldq m0, 1
7591 palignr m2, m0, 14
7592 pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
7593 pmulhrsw m0, m7
7594 packuswb m6, m0
7595 pmaddubsw m1, m2, [r4 + 16] ; [17]
7596 pmulhrsw m1, m7
7597 pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
7598 pmulhrsw m0, m7
7599 packuswb m1, m0
7600 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7601 pslldq m0, above, 5
7602 palignr m2, m0, 14
7603 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
7604 pmulhrsw m4, m7
7605 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
7606 pmulhrsw m5, m7
7607 packuswb m4, m5
7608 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
7609 pmulhrsw m5, m7
7610 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7611 pmulhrsw m6, m7
7612 packuswb m5, m6
7613 pslldq m0, 1
7614 palignr m2, m0, 14
7615 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
7616 pmulhrsw m6, m7
7617 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
7618 pmulhrsw m1, m7
7619 packuswb m6, m1
7620 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
7621 pmulhrsw m1, m7
7622 pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
7623 pmulhrsw m3, m7
7624 packuswb m1, m3
7625 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7626%endmacro
7627
7628%macro MODE_13_23 1
 ; Passes 2..4 of the 32x32 mode 13 / mode 23 predictor.
 ; Expects: r2 = reference pointer already advanced by the caller,
 ;          r4 = ang_table + 16 * 16, m7 = pmulhrsw multiplier.
 ; Unlike MODE_13_23_ROW0 this macro never touches `above`: every pixel it
 ; needs is read from memory at r2 - 7 .. r2 + 16.
 ; %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
 ; The bracketed number after each pmaddubsw is the ang_table entry used.
 ; Lane comments on the loads below give indices relative to the load address
 ; of that instruction unless stated otherwise.
7629 movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7630 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7631 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7632 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7633 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7634 pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
7635 pmulhrsw m4, m7
7636 pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
7637 pmulhrsw m3, m7
7638 packuswb m4, m3
7639 pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
7640 pmulhrsw m5, m7
7641 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7642 pmulhrsw m6, m7
7643 packuswb m5, m6
7644 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
7645 pmulhrsw m6, m7
7646 pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
7647 pmulhrsw m3, m7
7648 packuswb m6, m3
7649 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
7650 pmulhrsw m1, m7
7651 movu m2, [r2 - 2] ; relative to r2: [13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2]
7652 palignr m3, m2, 1 ; relative to r2: [x, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
7653 punpckhbw m0, m2, m3
7654 punpcklbw m2, m3
7655 palignr m0, m2, 2
7656 pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
7657 pmulhrsw m3, m7
7658 packuswb m1, m3
7659 mova m3, m0 ; keep the shifted pairs: m0 is reused by the next reference load
7660 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7661 pmaddubsw m4, m3, [r4 - 16] ; [15]
7662 pmulhrsw m4, m7
7663 pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
7664 pmulhrsw m5, m7
7665 packuswb m4, m5
7666 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
7667 pmulhrsw m5, m7
7668 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7669 pmulhrsw m6, m7
7670 packuswb m5, m6
7671 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
7672 pmulhrsw m6, m7
7673 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
7674 pmulhrsw m1, m7
7675 packuswb m6, m1
7676 movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7677 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7678 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7679 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7680 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7681 pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
7682 pmulhrsw m1, m7
7683 pmaddubsw m3, m0, [r4] ; [16]
7684 pmulhrsw m3, m7
7685 packuswb m1, m3
7686 mova m3, m0
7687 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7688 pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
7689 pmulhrsw m4, m7
7690 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
7691 pmulhrsw m3, m7
7692 packuswb m4, m3
7693 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
7694 pmulhrsw m5, m7
7695 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7696 pmulhrsw m6, m7
7697 packuswb m5, m6
7698 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
7699 pmulhrsw m6, m7
7700 movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7701 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7702 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7703 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7704 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7705 pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
7706 pmulhrsw m3, m7
7707 packuswb m6, m3
7708 pmaddubsw m1, m0, [r4 + 16] ; [17]
7709 pmulhrsw m1, m7
7710 pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
7711 pmulhrsw m3, m7
7712 packuswb m1, m3
7713 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7714 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
7715 pmulhrsw m4, m7
7716 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
7717 pmulhrsw m5, m7
7718 packuswb m4, m5
7719 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
7720 pmulhrsw m5, m7
7721 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7722 pmulhrsw m6, m7
7723 packuswb m5, m6
7724 movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7725 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7726 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7727 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
7728 pmulhrsw m6, m7
7729 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
7730 pmulhrsw m1, m7
7731 packuswb m6, m1
7732 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
7733 pmulhrsw m1, m7
 ; Last row of the pass: no multiply. pb_fact0 selects the even bytes of m2,
 ; i.e. the first pixel of every interleaved pair, which are widened to words
 ; so that they can be packed next to the row computed just above.
7734 movu m0, [pb_fact0]
7735 pshufb m2, m0
7736 pmovzxbw m2, m2
7737 packuswb m1, m2
7738 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7739%endmacro
7740;-----------------------------------------------------------------------------------------------------------------
7741; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7742;-----------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 13 (SSE4).
; cglobal 4,7,8: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (prototype
; order), 7 GPRs, 8 XMM registers, one mmsize-byte stack slot named `above`.
; Four passes: MODE_13_23_ROW0 once, then MODE_13_23 three times. Each pass is
; followed by r0/r6 += 8 * stride and an advance of r2 (7 bytes, then 8 bytes).
; The macro argument `1` is passed through to TRANSPOSE_STORE_8x8, which is
; defined outside this excerpt.
7743INIT_XMM sse4
7744cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
7745%define above [rsp + 0 * mmsize]
7746 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
7747 lea r5, [r1 * 3] ; r5 -> 3 * stride
7748 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
7749 mova m7, [pw_1024] ; multiplier for every pmulhrsw in the macros
7750
7751 MODE_13_23_ROW0 1
7752 lea r0, [r6 + r1 * 4] ; r0 = previous r0 + 8 * stride
7753 lea r6, [r6 + r1 * 8] ; r6 stays 4 * stride ahead of r0
7754 add r2, 7
7755 mov r3, 3 ; refAbove is no longer needed: r3 becomes the pass counter
7756.loop:
7757 MODE_13_23 1
7758 lea r0, [r6 + r1 * 4]
7759 lea r6, [r6 + r1 * 8]
7760 add r2, 8
7761 dec r3
7762 jnz .loop
7763 RET
7764
7765;-------------------------------------------------------------------------------------------------------------------
7766; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7767;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 14 (SSE4).
; On entry r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (prototype order).
; Hand-made stack frame, 64-byte aligned, original rsp saved at [rsp + 64]:
;   [rsp +  0 .. 12]  bytes picked from refAbove through c_mode32_14_0
;   [rsp + 13 .. 44]  32 bytes copied from refLeft + 1
;   [rsp + 63]        pass counter (4)
; Every pass issues four PROC32_8x8 calls (first argument 0..3), then sets
; r0 = r6 + 4 * stride, r6 += 8 * stride and r2 += 8.
; Before each call m0..m7 are set to the bytes loaded into m7 advanced by
; 0..3 bytes; registers that need the same byte offset are plain copies.
; The eight trailing PROC32_8x8 arguments equal ((row + 1) * -13) & 31 for
; rows 0..31 over the four calls.
; NOTE(review): PROC32_8x8, c_shuf8_0 and c_deinterval8 are defined outside
; this excerpt; how the macro consumes r3/r4/r5/r6 and m0..m7 should be
; confirmed there.
7768INIT_XMM sse4
7769cglobal intra_pred_ang32_14, 4,7,8
7770 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7771 mov r6, rsp
7772 sub rsp, 64+gprsize
7773 and rsp, ~63
7774 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
7775
7776 ; collect reference pixel
7777 movu m0, [r3]
7778 movu m1, [r3 + 15]
7779 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
7780 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
7781 pslldq m1, 10 ; [17 20 22 25 27 30 0 0 0 0 0 0 0 0 0 0]
7782 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
7783 mova [rsp], m0
7784 movu m0, [r2 + 1]
7785 movu m1, [r2 + 1 + 16]
7786 movu [rsp + 13], m0 ; overwrites the three don't-care bytes at the top of the block above
7787 movu [rsp + 13 + 16], m1
7788 mov [rsp + 63], byte 4
7789
7790 ; filter
7791 lea r2, [rsp + 13] ; r2 -> [0]
7792 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7793 lea r4, [ang_table] ; r4 -> ang_table
7794 lea r5, [r1 * 3] ; r5 -> 3 * stride
7795 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
7796 mova m5, [pw_1024] ; m5 -> 1024
7797 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
7798
7799.loop:
7800 ; Row[0 - 7]
7801 movu m7, [r2 - 4]
7802 palignr m0, m7, 3
7803 mova m1, m0
7804 palignr m2, m7, 2
7805 mova m3, m2
7806 palignr m4, m7, 1
7807 mova m5, m4
7808 mova m6, m4
7809 PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24
7810
7811 ; Row[8 - 15]
7812 movu m7, [r2 - 7]
7813 palignr m0, m7, 3
7814 palignr m1, m7, 2
7815 mova m2, m1
7816 mova m3, m1
7817 palignr m4, m7, 1
7818 mova m5, m4
7819 mova m6, m7
7820 PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16
7821
7822 ; Row[16 - 23]
7823 movu m7, [r2 - 10]
7824 palignr m0, m7, 3
7825 palignr m1, m7, 2
7826 mova m2, m1
7827 palignr m3, m7, 1
7828 mova m4, m3
7829 mova m5, m3
7830 mova m6, m7
7831 PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8
7832
7833 ; Row[24 - 31]
7834 movu m7, [r2 - 13]
7835 palignr m0, m7, 2
7836 mova m1, m0
7837 mova m2, m0
7838 palignr m3, m7, 1
7839 mova m4, m3
7840 mova m5, m7
7841 mova m6, m7
7842 PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0
7843
7844 lea r0, [r6 + r1 * 4]
7845 lea r6, [r6 + r1 * 8]
7846 add r2, 8
7847 dec byte [rsp + 63]
7848 jnz .loop
7849 mov rsp, [rsp+64] ; restore the caller's rsp before RET
7850 RET
7851
7852;-------------------------------------------------------------------------------------------------------------------
7853; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7854;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 15 (SSE4).
; On entry r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (prototype order).
; Hand-made stack frame, 64-byte aligned, original rsp saved at [rsp + 64]:
;   [rsp +  0 .. 16]  bytes picked from refAbove through c_mode32_15_0
;                     (two overlapping 16-byte stores at +0 and +8)
;   [rsp + 17 .. 48]  32 bytes copied from refLeft + 1
;   [rsp + 63]        pass counter (4)
; Every pass issues four PROC32_8x8 calls, then sets r0 = r6 + 4 * stride,
; r6 += 8 * stride and r2 += 8.
; The eight trailing PROC32_8x8 arguments equal ((row + 1) * -17) & 31 for
; rows 0..31 over the four calls.
; NOTE(review): PROC32_8x8, c_shuf8_0 and c_deinterval8 are defined outside
; this excerpt; confirm register usage there.
7855INIT_XMM sse4
7856cglobal intra_pred_ang32_15, 4,7,8
7857 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7858 mov r6, rsp
7859 sub rsp, 64+gprsize
7860 and rsp, ~63
7861 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
7862
7863 ; collect reference pixel
7864 movu m0, [r3]
7865 movu m1, [r3 + 15]
7866 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
7867 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
7868 mova [rsp], m1
7869 movu [rsp + 8], m0 ; overlaps the don't-care upper lanes of the previous store
7870 movu m0, [r2 + 1]
7871 movu m1, [r2 + 1 + 16]
7872 movu [rsp + 17], m0
7873 movu [rsp + 17 + 16], m1
7874 mov [rsp + 63], byte 4
7875
7876 ; filter
7877 lea r2, [rsp + 17] ; r2 -> [0]
7878 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7879 lea r4, [ang_table] ; r4 -> ang_table
7880 lea r5, [r1 * 3] ; r5 -> 3 * stride
7881 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
7882 mova m5, [pw_1024] ; m5 -> 1024
7883 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
7884
7885.loop:
7886 ; Row[0 - 7]
7887 movu m7, [r2 - 5]
7888 palignr m0, m7, 4
7889 palignr m1, m7, 3
7890 mova m2, m1
7891 palignr m3, m7, 2
7892 mova m4, m3
7893 palignr m5, m7, 1
7894 mova m6, m5
7895 PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24
7896
7897 ; Row[8 - 15]
7898 movu m7, [r2 - 9]
7899 palignr m0, m7, 4
7900 palignr m1, m7, 3
7901 mova m2, m1
7902 palignr m3, m7, 2
7903 mova m4, m3
7904 palignr m5, m7, 1
7905 mova m6, m5
7906 PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16
7907
7908 ; Row[16 - 23]
7909 movu m7, [r2 - 13]
7910 palignr m0, m7, 3
7911 mova m1, m0
7912 palignr m2, m7, 2
7913 mova m3, m2
7914 palignr m4, m7, 1
7915 mova m5, m4
7916 mova m6, m7
7917 PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8
7918
7919 ; Row[24 - 31]
7920 movu m7, [r2 - 17]
7921 palignr m0, m7, 3
7922 mova m1, m0
7923 palignr m2, m7, 2
7924 mova m3, m2
7925 palignr m4, m7, 1
7926 mova m5, m4
7927 mova m6, m7
7928 PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0
7929
7930 lea r0, [r6 + r1 * 4]
7931 lea r6, [r6 + r1 * 8]
7932 add r2, 8
7933 dec byte [rsp + 63]
7934 jnz .loop
7935 mov rsp, [rsp+64] ; restore the caller's rsp before RET
7936 RET
7937
7938;-------------------------------------------------------------------------------------------------------------------
7939; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7940;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 16 (SSE4).
; On entry r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (prototype order).
; Hand-made stack frame, 64-byte aligned, original rsp saved at [rsp + 64]:
;   [rsp +  0 .. 20]  bytes picked from refAbove through c_mode32_16_0
;                     (two overlapping 16-byte stores at +0 and +10)
;   [rsp + 21 .. 52]  32 bytes copied from refLeft + 1
;   [rsp + 63]        pass counter (4)
; Every pass issues four PROC32_8x8 calls, then sets r0 = r6 + 4 * stride,
; r6 += 8 * stride and r2 += 8.
; The eight trailing PROC32_8x8 arguments equal ((row + 1) * -21) & 31 for
; rows 0..31 over the four calls.
; NOTE(review): PROC32_8x8, c_shuf8_0 and c_deinterval8 are defined outside
; this excerpt; confirm register usage there.
7941INIT_XMM sse4
7942cglobal intra_pred_ang32_16, 4,7,8
7943 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7944 mov r6, rsp
7945 sub rsp, 64+gprsize
7946 and rsp, ~63
7947 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
7948
7949 ; collect reference pixel
7950 movu m0, [r3]
7951 movu m1, [r3 + 15]
7952 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
7953 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
7954 mova [rsp], m1
7955 movu [rsp + 10], m0 ; overlaps the don't-care upper lanes of the previous store
7956 movu m0, [r2 + 1]
7957 movu m1, [r2 + 1 + 16]
7958 movu [rsp + 21], m0
7959 movu [rsp + 21 + 16], m1
7960 mov [rsp + 63], byte 4
7961
7962 ; filter
7963 lea r2, [rsp + 21] ; r2 -> [0]
7964 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7965 lea r4, [ang_table] ; r4 -> ang_table
7966 lea r5, [r1 * 3] ; r5 -> 3 * stride
7967 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
7968 mova m5, [pw_1024] ; m5 -> 1024
7969 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
7970
7971.loop:
7972 ; Row[0 - 7]
7973 movu m7, [r2 - 6]
7974 palignr m0, m7, 5
7975 palignr m1, m7, 4
7976 mova m2, m1
7977 palignr m3, m7, 3
7978 palignr m4, m7, 2
7979 mova m5, m4
7980 palignr m6, m7, 1
7981 PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24
7982
7983 ; Row[8 - 15]
7984 movu m7, [r2 - 11]
7985 palignr m0, m7, 5
7986 palignr m1, m7, 4
7987 palignr m2, m7, 3
7988 mova m3, m2
7989 palignr m4, m7, 2
7990 palignr m5, m7, 1
7991 mova m6, m5
7992 PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16
7993
7994 ; Row[16 - 23]
7995 movu m7, [r2 - 16]
7996 palignr m0, m7, 4
7997 mova m1, m0
7998 palignr m2, m7, 3
7999 palignr m3, m7, 2
8000 mova m4, m3
8001 palignr m5, m7, 1
8002 mova m6, m7
8003 PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8
8004
8005 ; Row[24 - 31]
8006 movu m7, [r2 - 21]
8007 palignr m0, m7, 4
8008 palignr m1, m7, 3
8009 mova m2, m1
8010 palignr m3, m7, 2
8011 palignr m4, m7, 1
8012 mova m5, m4
8013 mova m6, m7
8014 PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0
8015
8016 lea r0, [r6 + r1 * 4]
8017 lea r6, [r6 + r1 * 8]
8018 add r2, 8
8019 dec byte [rsp + 63]
8020 jnz .loop
8021 mov rsp, [rsp+64] ; restore the caller's rsp before RET
8022 RET
8023
8024;------------------------------------------------------------------------------------------------------------------
8025; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8026;------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 17 (SSE4).
; On entry r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (prototype order).
; Hand-made stack frame, 64-byte aligned, original rsp saved at [rsp + 64]:
;   [rsp +  0 .. 25]  bytes picked from refAbove through c_mode32_17_0
;                     (two overlapping 16-byte stores at +0 and +13)
;   [rsp + 26 .. 57]  32 bytes copied from refLeft + 1
;   [rsp + 63]        pass counter (4)
; Unlike modes 14-16, the second refAbove load is at +16 (not +15) and r2 is
; set one byte below the copied refLeft data (rsp + 25).
; Every pass issues four PROC32_8x8 calls, then sets r0 = r6 + 4 * stride,
; r6 += 8 * stride and r2 += 8.
; The eight trailing PROC32_8x8 arguments equal ((row + 1) * -26) & 31; the
; sequence repeats every 16 rows, which is why calls 0/2 and 1/3 share weights.
; NOTE(review): PROC32_8x8, c_shuf8_0, c_mode32_17_0 and c_deinterval8 are
; defined outside this excerpt; confirm register usage there.
8027INIT_XMM sse4
8028cglobal intra_pred_ang32_17, 4,7,8
8029 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8030 mov r6, rsp
8031 sub rsp, 64+gprsize
8032 and rsp, ~63
8033 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
8034
8035 ; collect reference pixel
8036 movu m0, [r3]
8037 movu m1, [r3 + 16]
8038 pshufb m0, [c_mode32_17_0]
8039 pshufb m1, [c_mode32_17_0]
8040 mova [rsp ], m1
8041 movu [rsp + 13], m0
8042 movu m0, [r2 + 1]
8043 movu m1, [r2 + 1 + 16]
8044 movu [rsp + 26], m0
8045 movu [rsp + 26 + 16], m1
8046 mov [rsp + 63], byte 4
8047
8048 ; filter
8049 lea r2, [rsp + 25] ; r2 -> [0]
8050 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8051 lea r4, [ang_table] ; r4 -> ang_table
8052 lea r5, [r1 * 3] ; r5 -> 3 * stride
8053 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
8054 mova m5, [pw_1024] ; m5 -> 1024
8055 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
8056
8057.loop:
8058 ; Row[0 - 7]
8059 movu m7, [r2 - 6]
8060 palignr m0, m7, 6
8061 palignr m1, m7, 5
8062 palignr m2, m7, 4
8063 palignr m3, m7, 3
8064 palignr m4, m7, 2
8065 mova m5, m4
8066 palignr m6, m7, 1
8067 PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16
8068
8069 ; Row[8 - 15]
8070 movu m7, [r2 - 12]
8071 palignr m0, m7, 5
8072 palignr m1, m7, 4
8073 mova m2, m1
8074 palignr m3, m7, 3
8075 palignr m4, m7, 2
8076 palignr m5, m7, 1
8077 mova m6, m7
8078 PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0
8079
8080 ; Row[16 - 23]
8081 movu m7, [r2 - 19]
8082 palignr m0, m7, 6
8083 palignr m1, m7, 5
8084 palignr m2, m7, 4
8085 palignr m3, m7, 3
8086 palignr m4, m7, 2
8087 mova m5, m4
8088 palignr m6, m7, 1
8089 PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16
8090
8091 ; Row[24 - 31]
8092 movu m7, [r2 - 25]
8093 palignr m0, m7, 5
8094 palignr m1, m7, 4
8095 mova m2, m1
8096 palignr m3, m7, 3
8097 palignr m4, m7, 2
8098 palignr m5, m7, 1
8099 mova m6, m7
8100 PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0
8101
8102 lea r0, [r6 + r1 * 4]
8103 lea r6, [r6 + r1 * 8]
8104 add r2, 8
8105 dec byte [rsp + 63]
8106 jnz .loop
8107 mov rsp, [rsp+64] ; restore the caller's rsp before RET
8108
8109 RET
8110
8111;-------------------------------------------------------------------------------------------------------------------
8112; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8113;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 18 (SSE4): a pure byte copy, no filtering.
; On entry r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (prototype order).
; Row 0 is refAbove[0..31]. Every following row is the previous row moved one
; byte to the right, with the next refLeft pixel entering at column 0:
;   rows  1..16 take their new bytes from m2 (refLeft[1..16] after the
;               c_mode32_18_0 shuffle, which the lane comments show as a reversal),
;   rows 17..31 take them from m3 (refLeft[17..32], shuffled the same way).
; Each row is written as two 16-byte stores (columns 0..15 and 16..31); rows are
; emitted in eight groups of four using r1/r2/r3 = 1/2/3 * stride and r4 = 4 * stride.
8114INIT_XMM sse4
8115cglobal intra_pred_ang32_18, 4,5,5
8116 movu m0, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
8117 movu m1, [r3 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
8118 movu m2, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
8119 movu m3, [r2 + 17] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
8120
 ; the reference pointers are dead from here on: reuse r2..r4 as stride multiples
8121 lea r2, [r1 * 2]
8122 lea r3, [r1 * 3]
8123 lea r4, [r1 * 4]
8124
 ; row 0
8125 movu [r0], m0
8126 movu [r0 + 16], m1
8127
8128 pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
8129 pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]
8130
 ; rows 1..3
8131 palignr m4, m0, m2, 15
8132 movu [r0 + r1], m4
8133 palignr m4, m1, m0, 15
8134 movu [r0 + r1 + 16], m4
8135 palignr m4, m0, m2, 14
8136 movu [r0 + r2], m4
8137 palignr m4, m1, m0, 14
8138 movu [r0 + r2 + 16], m4
8139 palignr m4, m0, m2, 13
8140 movu [r0 + r3], m4
8141 palignr m4, m1, m0, 13
8142 movu [r0 + r3 + 16], m4
8143
8144 lea r0, [r0 + r4]
8145
 ; rows 4..7
8146 palignr m4, m0, m2, 12
8147 movu [r0], m4
8148 palignr m4, m1, m0, 12
8149 movu [r0 + 16], m4
8150 palignr m4, m0, m2, 11
8151 movu [r0 + r1], m4
8152 palignr m4, m1, m0, 11
8153 movu [r0 + r1 + 16], m4
8154 palignr m4, m0, m2, 10
8155 movu [r0 + r2], m4
8156 palignr m4, m1, m0, 10
8157 movu [r0 + r2 + 16], m4
8158 palignr m4, m0, m2, 9
8159 movu [r0 + r3], m4
8160 palignr m4, m1, m0, 9
8161 movu [r0 + r3 + 16], m4
8162
8163 lea r0, [r0 + r4]
8164
 ; rows 8..11
8165 palignr m4, m0, m2, 8
8166 movu [r0], m4
8167 palignr m4, m1, m0, 8
8168 movu [r0 + 16], m4
8169 palignr m4, m0, m2, 7
8170 movu [r0 + r1], m4
8171 palignr m4, m1, m0, 7
8172 movu [r0 + r1 + 16], m4
8173 palignr m4, m0, m2, 6
8174 movu [r0 + r2], m4
8175 palignr m4, m1, m0, 6
8176 movu [r0 + r2 + 16], m4
8177 palignr m4, m0, m2, 5
8178 movu [r0 + r3], m4
8179 palignr m4, m1, m0, 5
8180 movu [r0 + r3 + 16], m4
8181
8182 lea r0, [r0 + r4]
8183
 ; rows 12..15
8184 palignr m4, m0, m2, 4
8185 movu [r0], m4
8186 palignr m4, m1, m0, 4
8187 movu [r0 + 16], m4
8188 palignr m4, m0, m2, 3
8189 movu [r0 + r1], m4
8190 palignr m4, m1, m0, 3
8191 movu [r0 + r1 + 16], m4
8192 palignr m4, m0, m2, 2
8193 movu [r0 + r2], m4
8194 palignr m4, m1, m0, 2
8195 movu [r0 + r2 + 16], m4
8196 palignr m4, m0, m2, 1
8197 movu [r0 + r3], m4
8198 palignr m4, m1, m0, 1
8199 movu [r0 + r3 + 16], m4
8200
8201 lea r0, [r0 + r4]
8202
 ; rows 16..19: the left half is now entirely refLeft data (m2), fed from m3
8203 movu [r0], m2
8204 movu [r0 + 16], m0
8205 palignr m4, m2, m3, 15
8206 movu [r0 + r1], m4
8207 palignr m4, m0, m2, 15
8208 movu [r0 + r1 + 16], m4
8209 palignr m4, m2, m3, 14
8210 movu [r0 + r2], m4
8211 palignr m4, m0, m2, 14
8212 movu [r0 + r2 + 16], m4
8213 palignr m4, m2, m3, 13
8214 movu [r0 + r3], m4
8215 palignr m4, m0, m2, 13
8216 movu [r0 + r3 + 16], m4
8217
8218 lea r0, [r0 + r4]
8219
 ; rows 20..23
8220 palignr m4, m2, m3, 12
8221 movu [r0], m4
8222 palignr m4, m0, m2, 12
8223 movu [r0 + 16], m4
8224 palignr m4, m2, m3, 11
8225 movu [r0 + r1], m4
8226 palignr m4, m0, m2, 11
8227 movu [r0 + r1 + 16], m4
8228 palignr m4, m2, m3, 10
8229 movu [r0 + r2], m4
8230 palignr m4, m0, m2, 10
8231 movu [r0 + r2 + 16], m4
8232 palignr m4, m2, m3, 9
8233 movu [r0 + r3], m4
8234 palignr m4, m0, m2, 9
8235 movu [r0 + r3 + 16], m4
8236
8237 lea r0, [r0 + r4]
8238
 ; rows 24..27
8239 palignr m4, m2, m3, 8
8240 movu [r0], m4
8241 palignr m4, m0, m2, 8
8242 movu [r0 + 16], m4
8243 palignr m4, m2, m3, 7
8244 movu [r0 + r1], m4
8245 palignr m4, m0, m2, 7
8246 movu [r0 + r1 + 16], m4
8247 palignr m4, m2, m3, 6
8248 movu [r0 + r2], m4
8249 palignr m4, m0, m2, 6
8250 movu [r0 + r2 + 16], m4
8251 palignr m4, m2, m3, 5
8252 movu [r0 + r3], m4
8253 palignr m4, m0, m2, 5
8254 movu [r0 + r3 + 16], m4
8255
8256 lea r0, [r0 + r4]
8257
 ; rows 28..31
8258 palignr m4, m2, m3, 4
8259 movu [r0], m4
8260 palignr m4, m0, m2, 4
8261 movu [r0 + 16], m4
8262 palignr m4, m2, m3, 3
8263 movu [r0 + r1], m4
8264 palignr m4, m0, m2, 3
8265 movu [r0 + r1 + 16], m4
8266 palignr m4, m2, m3, 2
8267 movu [r0 + r2], m4
8268 palignr m4, m0, m2, 2
8269 movu [r0 + r2 + 16], m4
8270 palignr m4, m2, m3, 1
8271 movu [r0 + r3], m4
8272 palignr m4, m0, m2, 1
8273 movu [r0 + r3 + 16], m4
8274 RET
8275
8276;------------------------------------------------------------------------------------------------------------------
8277; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8278;------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 19 (SSE4).
; Same reference gathering, byte offsets and PROC32_8x8 weights as
; intra_pred_ang32_17, with two differences visible here:
;   - `xchg r2, r3` swaps the two reference pointers first, so the gather reads
;     refLeft and the 32-byte copy reads refAbove + 1;
;   - PROC32_8x8 is called with second argument 0 instead of 1, r0 is moved down
;     4 * stride before calls 1..3, and each pass ends with r6 += 8, r0 = r6
;     (the destination moves 8 bytes to the right instead of 8 rows down).
; Stack frame (64-byte aligned, original rsp at [rsp + 64]):
;   [rsp +  0 .. 25]  gathered bytes (c_mode32_17_0), [rsp + 26 .. 57] copied
;   reference, [rsp + 63] pass counter (4).
; NOTE(review): PROC32_8x8, c_shuf8_0, c_mode32_17_0 and c_deinterval8 are
; defined outside this excerpt; confirm register usage there.
8279INIT_XMM sse4
8280cglobal intra_pred_ang32_19, 4,7,8
8281 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8282 xchg r2, r3
8283 mov r6, rsp
8284 sub rsp, 64+gprsize
8285 and rsp, ~63
8286 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
8287
8288 ; collect reference pixel
8289 movu m0, [r3]
8290 movu m1, [r3 + 16]
8291 pshufb m0, [c_mode32_17_0]
8292 pshufb m1, [c_mode32_17_0]
8293 mova [rsp ], m1
8294 movu [rsp + 13], m0
8295 movu m0, [r2 + 1]
8296 movu m1, [r2 + 1 + 16]
8297 movu [rsp + 26], m0
8298 movu [rsp + 26 + 16], m1
8299 mov [rsp + 63], byte 4
8300
8301 ; filter
8302 lea r2, [rsp + 25] ; r2 -> [0]
8303 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8304 lea r4, [ang_table] ; r4 -> ang_table
8305 lea r5, [r1 * 3] ; r5 -> 3 * stride
8306 lea r6, [r0] ; r6 -> r0
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
8307 mova m5, [pw_1024] ; m5 -> 1024
8308 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
8309
8310.loop:
8311 ; Row[0 - 7]
8312 movu m7, [r2 - 6]
8313 palignr m0, m7, 6
8314 palignr m1, m7, 5
8315 palignr m2, m7, 4
8316 palignr m3, m7, 3
8317 palignr m4, m7, 2
8318 mova m5, m4
8319 palignr m6, m7, 1
8320 PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16
8321
8322 ; Row[8 - 15]
8323 movu m7, [r2 - 12]
8324 palignr m0, m7, 5
8325 palignr m1, m7, 4
8326 mova m2, m1
8327 palignr m3, m7, 3
8328 palignr m4, m7, 2
8329 palignr m5, m7, 1
8330 mova m6, m7
8331 lea r0, [r0 + r1 * 4]
8332 PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0
8333
8334 ; Row[16 - 23]
8335 movu m7, [r2 - 19]
8336 palignr m0, m7, 6
8337 palignr m1, m7, 5
8338 palignr m2, m7, 4
8339 palignr m3, m7, 3
8340 palignr m4, m7, 2
8341 mova m5, m4
8342 palignr m6, m7, 1
8343 lea r0, [r0 + r1 * 4]
8344 PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16
8345
8346 ; Row[24 - 31]
8347 movu m7, [r2 - 25]
8348 palignr m0, m7, 5
8349 palignr m1, m7, 4
8350 mova m2, m1
8351 palignr m3, m7, 3
8352 palignr m4, m7, 2
8353 palignr m5, m7, 1
8354 mova m6, m7
8355 lea r0, [r0 + r1 * 4]
8356 PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0
8357
8358 add r6, 8 ; next 8-byte column strip of dst
8359 mov r0, r6
8360 add r2, 8
8361 dec byte [rsp + 63]
8362 jnz .loop
8363 mov rsp, [rsp+64] ; restore the caller's rsp before RET
8364 RET
8365
8366;-------------------------------------------------------------------------------------------------------------------
8367; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8368;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 20 (SSE4).
; Same reference gathering, byte offsets and PROC32_8x8 weights as
; intra_pred_ang32_16, with two differences visible here:
;   - `xchg r2, r3` swaps the two reference pointers first, so the gather reads
;     refLeft and the 32-byte copy reads refAbove + 1;
;   - PROC32_8x8 is called with second argument 0 instead of 1, r0 is moved down
;     4 * stride before calls 1..3, and each pass ends with r6 += 8, r0 = r6
;     (the destination moves 8 bytes to the right instead of 8 rows down).
; Stack frame (64-byte aligned, original rsp at [rsp + 64]):
;   [rsp +  0 .. 20]  gathered bytes (c_mode32_16_0), [rsp + 21 .. 52] copied
;   reference, [rsp + 63] pass counter (4).
; NOTE(review): PROC32_8x8, c_shuf8_0 and c_deinterval8 are defined outside
; this excerpt; confirm register usage there.
8369INIT_XMM sse4
8370cglobal intra_pred_ang32_20, 4,7,8
8371 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8372 xchg r2, r3
8373 mov r6, rsp
8374 sub rsp, 64+gprsize
8375 and rsp, ~63
8376 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
8377
8378 ; collect reference pixel
8379 movu m0, [r3]
8380 movu m1, [r3 + 15]
8381 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
8382 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
8383 mova [rsp], m1
8384 movu [rsp + 10], m0 ; overlaps the don't-care upper lanes of the previous store
8385 movu m0, [r2 + 1]
8386 movu m1, [r2 + 1 + 16]
8387 movu [rsp + 21], m0
8388 movu [rsp + 21 + 16], m1
8389 mov [rsp + 63], byte 4
8390
8391 ; filter
8392 lea r2, [rsp + 21] ; r2 -> [0]
8393 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8394 lea r4, [ang_table] ; r4 -> ang_table
8395 lea r5, [r1 * 3] ; r5 -> 3 * stride
8396 lea r6, [r0] ; r6 -> r0
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
8397 mova m5, [pw_1024] ; m5 -> 1024
8398 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
8399
8400.loop:
8401 ; Row[0 - 7]
8402 movu m7, [r2 - 6]
8403 palignr m0, m7, 5
8404 palignr m1, m7, 4
8405 mova m2, m1
8406 palignr m3, m7, 3
8407 palignr m4, m7, 2
8408 mova m5, m4
8409 palignr m6, m7, 1
8410 PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24
8411
8412 ; Row[8 - 15]
8413 movu m7, [r2 - 11]
8414 palignr m0, m7, 5
8415 palignr m1, m7, 4
8416 palignr m2, m7, 3
8417 mova m3, m2
8418 palignr m4, m7, 2
8419 palignr m5, m7, 1
8420 mova m6, m5
8421 lea r0, [r0 + r1 * 4]
8422 PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16
8423
8424 ; Row[16 - 23]
8425 movu m7, [r2 - 16]
8426 palignr m0, m7, 4
8427 mova m1, m0
8428 palignr m2, m7, 3
8429 palignr m3, m7, 2
8430 mova m4, m3
8431 palignr m5, m7, 1
8432 mova m6, m7
8433 lea r0, [r0 + r1 * 4]
8434 PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8
8435
8436 ; Row[24 - 31]
8437 movu m7, [r2 - 21]
8438 palignr m0, m7, 4
8439 palignr m1, m7, 3
8440 mova m2, m1
8441 palignr m3, m7, 2
8442 palignr m4, m7, 1
8443 mova m5, m4
8444 mova m6, m7
8445 lea r0, [r0 + r1 * 4]
8446 PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0
8447
8448 add r6, 8 ; next 8-byte column strip of dst
8449 mov r0, r6
8450 add r2, 8
8451 dec byte [rsp + 63]
8452 jnz .loop
8453 mov rsp, [rsp+64] ; restore the caller's rsp before RET
8454 RET
8455
8456;-------------------------------------------------------------------------------------------------------------------
8457; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8458;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 21 (SSE4).
; Same reference gathering, byte offsets and PROC32_8x8 weights as
; intra_pred_ang32_15, with two differences visible here:
;   - `xchg r2, r3` swaps the two reference pointers first, so the gather reads
;     refLeft and the 32-byte copy reads refAbove + 1;
;   - PROC32_8x8 is called with second argument 0 instead of 1, r0 is moved down
;     4 * stride before calls 1..3, and each pass ends with r6 += 8, r0 = r6
;     (the destination moves 8 bytes to the right instead of 8 rows down).
; Stack frame (64-byte aligned, original rsp at [rsp + 64]):
;   [rsp +  0 .. 16]  gathered bytes (c_mode32_15_0), [rsp + 17 .. 48] copied
;   reference, [rsp + 63] pass counter (4).
; NOTE(review): PROC32_8x8, c_shuf8_0 and c_deinterval8 are defined outside
; this excerpt; confirm register usage there.
8459INIT_XMM sse4
8460cglobal intra_pred_ang32_21, 4,7,8
8461 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8462 xchg r2, r3
8463 mov r6, rsp
8464 sub rsp, 64+gprsize
8465 and rsp, ~63
8466 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
8467
8468 ; collect reference pixel
8469 movu m0, [r3]
8470 movu m1, [r3 + 15]
8471 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
8472 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
8473 mova [rsp], m1
8474 movu [rsp + 8], m0 ; overlaps the don't-care upper lanes of the previous store
8475 movu m0, [r2 + 1]
8476 movu m1, [r2 + 1 + 16]
8477 movu [rsp + 17], m0
8478 movu [rsp + 17 + 16], m1
8479 mov [rsp + 63], byte 4
8480
8481 ; filter
8482 lea r2, [rsp + 17] ; r2 -> [0]
8483 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8484 lea r4, [ang_table] ; r4 -> ang_table
8485 lea r5, [r1 * 3] ; r5 -> 3 * stride
8486 lea r6, [r0] ; r6 -> r0
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
8487 mova m5, [pw_1024] ; m5 -> 1024
8488 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
8489
8490.loop:
8491 ; Row[0 - 7]
8492 movu m7, [r2 - 5]
8493 palignr m0, m7, 4
8494 palignr m1, m7, 3
8495 mova m2, m1
8496 palignr m3, m7, 2
8497 mova m4, m3
8498 palignr m5, m7, 1
8499 mova m6, m5
8500 PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24
8501
8502 ; Row[8 - 15]
8503 movu m7, [r2 - 9]
8504 palignr m0, m7, 4
8505 palignr m1, m7, 3
8506 mova m2, m1
8507 palignr m3, m7, 2
8508 mova m4, m3
8509 palignr m5, m7, 1
8510 mova m6, m5
8511 lea r0, [r0 + r1 * 4]
8512 PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16
8513
8514 ; Row[16 - 23]
8515 movu m7, [r2 - 13]
8516 palignr m0, m7, 3
8517 mova m1, m0
8518 palignr m2, m7, 2
8519 mova m3, m2
8520 palignr m4, m7, 1
8521 mova m5, m4
8522 mova m6, m7
8523 lea r0, [r0 + r1 * 4]
8524 PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8
8525
8526 ; Row[24 - 31]
8527 movu m7, [r2 - 17]
8528 palignr m0, m7, 3
8529 mova m1, m0
8530 palignr m2, m7, 2
8531 mova m3, m2
8532 palignr m4, m7, 1
8533 mova m5, m4
8534 mova m6, m7
8535 lea r0, [r0 + r1 * 4]
8536 PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0
8537
8538 add r6, 8 ; next 8-byte column strip of dst
8539 mov r0, r6
8540 add r2, 8
8541 dec byte [rsp + 63]
8542 jnz .loop
8543 mov rsp, [rsp+64] ; restore the caller's rsp before RET
8544 RET
8545
8546;-------------------------------------------------------------------------------------------------------------------
8547; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8548;-------------------------------------------------------------------------------------------------------------------
; 32x32 angular intra prediction, mode 22 (SSE4).
; Same reference gathering, byte offsets and PROC32_8x8 weights as
; intra_pred_ang32_14, with two differences visible here:
;   - `xchg r2, r3` swaps the two reference pointers first, so the gather reads
;     refLeft and the 32-byte copy reads refAbove + 1;
;   - PROC32_8x8 is called with second argument 0 instead of 1, r0 is moved down
;     4 * stride before calls 1..3, and each pass ends with r6 += 8, r0 = r6
;     (the destination moves 8 bytes to the right instead of 8 rows down).
; Stack frame (64-byte aligned, original rsp at [rsp + 64]):
;   [rsp +  0 .. 12]  gathered bytes (c_mode32_14_0), [rsp + 13 .. 44] copied
;   reference, [rsp + 63] pass counter (4).
; NOTE(review): PROC32_8x8, c_shuf8_0 and c_deinterval8 are defined outside
; this excerpt; confirm register usage there.
8549INIT_XMM sse4
8550cglobal intra_pred_ang32_22, 4,7,8
8551 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8552
8553 xchg r2, r3
8554 mov r6, rsp
8555 sub rsp, 64+gprsize
8556 and rsp, ~63
8557 mov [rsp+64], r6 ; keep the caller's rsp for the epilogue
8558
8559 ; collect reference pixel
8560 movu m0, [r3]
8561 movu m1, [r3 + 15]
8562 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
8563 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
8564 pslldq m1, 10 ; [17 20 22 25 27 30 0 0 0 0 0 0 0 0 0 0]
8565 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
8566 mova [rsp], m0
8567 movu m0, [r2 + 1]
8568 movu m1, [r2 + 1 + 16]
8569 movu [rsp + 13], m0 ; overwrites the three don't-care bytes at the top of the block above
8570 movu [rsp + 13 + 16], m1
8571 mov [rsp + 63], byte 4
8572
8573 ; filter
8574 lea r2, [rsp + 13] ; r2 -> [0]
8575 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8576 lea r4, [ang_table] ; r4 -> ang_table
8577 lea r5, [r1 * 3] ; r5 -> 3 * stride
8578 lea r6, [r0] ; r6 -> r0
 ; NOTE(review): m5 and m6 are rewritten at the top of .loop before the first
 ; PROC32_8x8, so the two loads below look vestigial - TODO confirm and drop.
8579 mova m5, [pw_1024] ; m5 -> 1024
8580 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
8581
8582.loop:
8583 ; Row[0 - 7]
8584 movu m7, [r2 - 4]
8585 palignr m0, m7, 3
8586 mova m1, m0
8587 palignr m2, m7, 2
8588 mova m3, m2
8589 palignr m4, m7, 1
8590 mova m5, m4
8591 mova m6, m4
8592 PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24
8593
8594 ; Row[8 - 15]
8595 movu m7, [r2 - 7]
8596 palignr m0, m7, 3
8597 palignr m1, m7, 2
8598 mova m2, m1
8599 mova m3, m1
8600 palignr m4, m7, 1
8601 mova m5, m4
8602 mova m6, m7
8603 lea r0, [r0 + r1 * 4]
8604 PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16
8605
8606 ; Row[16 - 23]
8607 movu m7, [r2 - 10]
8608 palignr m0, m7, 3
8609 palignr m1, m7, 2
8610 mova m2, m1
8611 palignr m3, m7, 1
8612 mova m4, m3
8613 mova m5, m3
8614 mova m6, m7
8615 lea r0, [r0 + r1 * 4]
8616 PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8
8617
8618 ; Row[24 - 31]
8619 movu m7, [r2 - 13]
8620 palignr m0, m7, 2
8621 mova m1, m0
8622 mova m2, m0
8623 palignr m3, m7, 1
8624 mova m4, m3
8625 mova m5, m7
8626 mova m6, m7
8627 lea r0, [r0 + r1 * 4]
8628 PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0
8629
8630 add r6, 8 ; next 8-byte column strip of dst
8631 mov r0, r6
8632 add r2, 8
8633 dec byte [rsp + 63]
8634 jnz .loop
8635 mov rsp, [rsp+64] ; restore the caller's rsp before RET
8636 RET
8637
8638;-----------------------------------------------------------------------------------------------------------------
8639; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8640;-----------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 23, 32x32 block (SSE4).
; The block is produced as four 8-byte-wide column strips: the first strip goes through
; MODE_13_23_ROW0, the remaining three through MODE_13_23 (both macros are defined
; elsewhere in this file and consume the registers set up here).
8641INIT_XMM sse4
8642cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
8643%define above [rsp + 0 * mmsize]
8644 xchg r2, r3 ; swap the two reference pointers (refLeft <-> refAbove per the prototype)
8645 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table + 256 bytes
8646 lea r5, [r1 * 3] ; r5 -> 3 * stride
8647 mov r6, r0 ; r6 = start of the current strip in dst
8648 mova m7, [pw_1024]
8649
8650 MODE_13_23_ROW0 0
8651 add r6, 8 ; move to the second strip
8652 mov r0, r6
8653 add r2, 7 ; the first advance of r2 is 7 bytes ...
8654 mov r3, 3 ; three strips remain
8655.loop:
8656 MODE_13_23 0
8657 add r6, 8
8658 mov r0, r6
8659 add r2, 8 ; ... every later advance is 8 bytes
8660 dec r3
8661 jnz .loop
8662 RET
8663
8664;-----------------------------------------------------------------------------------------------------------------
8665; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8666;-----------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 24, 32x32 block (SSE4).
; The block is produced as four 8-byte-wide column strips: the first strip goes through
; MODE_12_24_ROW0, the remaining three through MODE_12_24 (both macros are defined
; elsewhere in this file and consume the registers set up here).
8667INIT_XMM sse4
8668cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
8669 %define above [rsp + 0 * mmsize]
8670 xchg r2, r3 ; swap the two reference pointers (refLeft <-> refAbove per the prototype)
8671 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table + 256 bytes
8672 lea r5, [r1 * 3] ; r5 -> 3 * stride
8673 mov r6, r0 ; r6 = start of the current strip in dst
8674 mova m7, [pw_1024]
8675
8676 MODE_12_24_ROW0 0
8677 add r6, 8 ; move to the second strip
8678 mov r0, r6
8679 add r2, 7 ; the first advance of r2 is 7 bytes ...
8680 mov r3, 3 ; three strips remain
8681.loop:
8682 MODE_12_24 0
8683 add r6, 8
8684 mov r0, r6
8685 add r2, 8 ; ... every later advance is 8 bytes
8686 dec r3
8687 jnz .loop
8688 RET
8689
8690;-------------------------------------------------------------------------------------------------------------------
8691; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8692;-------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 25, 32x32 block (SSE4).
; Registers after the prologue (argument names per the prototype comment style used in this file):
;   r0 = dst, r1 = dstStride, r2 = refAbove, r3 = refLeft (the xchg swaps args 2 and 3)
; A 64-byte-aligned scratch area on the stack holds the extended reference row:
;   [rsp + 0]         byte 16 of refLeft (broadcast over 16 bytes, bytes 1..15 then overwritten)
;   [rsp + 1 .. 48]   48 bytes copied from refAbove
;   [rsp + 63]        strip counter (4 strips, 8 output columns each)
;   [rsp + 64]        caller's rsp, restored before RET
; Each strip is four 8-row groups handed to PROC32_8x8 (defined elsewhere in this file).
; m0..m7 are all reloaded before every macro call; rows 0-15 read from r2, rows 16-31
; from r2 - 1.
8693INIT_XMM sse4
8694cglobal intra_pred_ang32_25, 4,7,8
8695 ; NOTE: the stack is aligned to 64 bytes so that all local data shares one cache line
8696 xchg r2, r3 ; r2 = refAbove, r3 = refLeft
8697 mov r6, rsp ; keep the caller's rsp
8698 sub rsp, 64+gprsize
8699 and rsp, ~63
8700 mov [rsp+64], r6 ; saved just above the 64-byte scratch area
8701
8702 ; collect reference pixel
8703 movu m0, [r3 + 16]
8704 pxor m1, m1
8705 pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0, i.e. refLeft[16]
8706 mova [rsp], m0
8707 movu m0, [r2]
8708 movu m1, [r2 + 16]
8709 movu m2, [r2 + 32]
8710 movu [rsp + 1], m0
8711 movu [rsp + 1 + 16], m1
8712 movu [rsp + 1 + 32], m2
8713 mov [rsp + 63], byte 4 ; four strips to process
8714
8715 ; filter
8716 lea r2, [rsp + 1] ; r2 -> copy of refAbove[0] in the scratch row
8717 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8718 lea r4, [ang_table] ; r4 -> ang_table
8719 lea r5, [r1 * 3] ; r5 -> 3 * stride
8720 lea r6, [r0] ; r6 -> r0
8723
8724.loop:
8725 ; Row[0 - 7]
8726 movu m7, [r2]
8727 mova m0, m7
8728 mova m1, m7
8729 mova m2, m7
8730 mova m3, m7
8731 mova m4, m7
8732 mova m5, m7
8733 mova m6, m7
8734 PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16
8735
8736 ; Row[8 - 15]
8737 movu m7, [r2]
8738 mova m0, m7
8739 mova m1, m7
8740 mova m2, m7
8741 mova m3, m7
8742 mova m4, m7
8743 mova m5, m7
8744 mova m6, m7
8745 lea r0, [r0 + r1 * 4]
8746 PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0
8747
8748 ; Row[16 - 23]
8749 movu m7, [r2 - 1]
8750 mova m0, m7
8751 mova m1, m7
8752 mova m2, m7
8753 mova m3, m7
8754 mova m4, m7
8755 mova m5, m7
8756 mova m6, m7
8757 lea r0, [r0 + r1 * 4]
8758 PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16
8759
8760 ; Row[24 - 31]
8761 movu m7, [r2 - 1]
8762 mova m0, m7
8763 mova m1, m7
8764 mova m2, m7
8765 mova m3, m7
8766 mova m4, m7
8767 mova m5, m7
8768 mova m6, m7
8769 lea r0, [r0 + r1 * 4]
8770 PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0
8771
8772 add r6, 8 ; next strip: 8 bytes to the right in dst
8773 mov r0, r6
8774 add r2, 8 ; and 8 bytes further along the reference row
8775 dec byte [rsp + 63]
8776 jnz .loop
8777 mov rsp, [rsp+64] ; restore the caller's rsp
8778 RET
8779
8780;------------------------------------------------------------------------------------------------------------------
8781; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8782;------------------------------------------------------------------------------------------------------------------
; Intra prediction, mode 26, 32x32 block (SSE4): every row is a copy of the reference at r3 + 1.
; Registers (argument names per the prototype comment above):
;   r0 = dst, r1 = dstStride, r2 = refLeft (consumed in the prologue), r3 = refAbove,
;   r5 = bFilter on entry (copied to r2d because r5 is reused as a row pointer)
;   r4 = 3 * stride, r6 = strip counter (2 strips of 16 columns)
;   m8 / m9 are %defined names for two stack slots holding [refLeft] and [refLeft + 1].
; Each strip stores the same 16 reference bytes to all 32 rows, every row exactly once.
; When bFilter is non-zero, byte 0 of rows 0..15 of the strip is then replaced by
;   sat_u8(ref0 + ((refLeft[1 + k] - refLeft[0]) >> 1)),  ref0 = first byte of the strip's m0.
; NOTE(review): the filter only covers 16 rows and is repeated for the second strip (at
; r0 + 16) — this looks meaningful only if callers pass bFilter = 0 for this size; confirm.
8783INIT_XMM sse4
8784cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
8785%define m8 [rsp + 0 * mmsize]
8786%define m9 [rsp + 1 * mmsize]
8787 lea r4, [r1 * 3] ; r4 = 3 * stride
8788 mov r6, 2 ; two 16-column strips
8789 movu m0, [r2]
8790 movu m1, [r2 + 1]
8791 mova m8, m0 ; keep [refLeft] for the filter
8792 mova m9, m1 ; keep [refLeft + 1] for the filter
8793 mov r2d, r5d ; r2d = bFilter (r5 becomes the row pointer below)
8794
8795.loop:
8796 movu m0, [r3 + 1]
8797
8798 movu [r0], m0 ; rows 0-3
8799 movu [r0 + r1], m0
8800 movu [r0 + r1 * 2], m0
8801 movu [r0 + r4], m0
8802 lea r5, [r0 + r1 * 4] ; rows 4-7
8803 movu [r5], m0
8804 movu [r5 + r1], m0
8805 movu [r5 + r1 * 2], m0
8806 movu [r5 + r4], m0
8807 lea r5, [r5 + r1 * 4] ; rows 8-11
8808 movu [r5], m0
8809 movu [r5 + r1], m0
8810 movu [r5 + r1 * 2], m0
8811 movu [r5 + r4], m0
8812 lea r5, [r5 + r1 * 4] ; rows 12-15
8813 movu [r5], m0
8814 movu [r5 + r1], m0
8815 movu [r5 + r1 * 2], m0
8816 movu [r5 + r4], m0
8832 lea r5, [r5 + r1 * 4] ; rows 16-19
8833 movu [r5], m0
8834 movu [r5 + r1], m0
8835 movu [r5 + r1 * 2], m0
8836 movu [r5 + r4], m0
8837 lea r5, [r5 + r1 * 4] ; rows 20-23
8838 movu [r5], m0
8839 movu [r5 + r1], m0
8840 movu [r5 + r1 * 2], m0
8841 movu [r5 + r4], m0
8842 lea r5, [r5 + r1 * 4] ; rows 24-27
8843 movu [r5], m0
8844 movu [r5 + r1], m0
8845 movu [r5 + r1 * 2], m0
8846 movu [r5 + r4], m0
8847 lea r5, [r5 + r1 * 4] ; rows 28-31
8848 movu [r5], m0
8849 movu [r5 + r1], m0
8850 movu [r5 + r1 * 2], m0
8851 movu [r5 + r4], m0
8852
8853; filter
8854 test r2d, r2d ; bFilter == 0 -> skip the edge filter
8855 jz .quit
8856
8857 pxor m4, m4
8858 pshufb m0, m4 ; broadcast byte 0 of the strip's reference bytes
8859 pmovzxbw m0, m0
8860 mova m1, m0
8861 movu m2, m8
8862 movu m3, m9
8863
8864 pshufb m2, m4 ; broadcast refLeft[0]
8865 pmovzxbw m2, m2
8866 movhlps m4, m3 ; m4 low half = refLeft[9..16]
8867 pmovzxbw m3, m3 ; m3 = refLeft[1..8] as words
8868 pmovzxbw m4, m4
8869 psubw m3, m2
8870 psubw m4, m2
8871 psraw m3, 1
8872 psraw m4, 1
8873 paddw m0, m3
8874 paddw m1, m4
8875 packuswb m0, m1 ; 16 filtered bytes, saturated to 0..255
8876
8877 pextrb [r0], m0, 0 ; byte k goes to column 0 of row k
8878 pextrb [r0 + r1], m0, 1
8879 pextrb [r0 + r1 * 2], m0, 2
8880 pextrb [r0 + r4], m0, 3
8881 lea r5, [r0 + r1 * 4]
8882 pextrb [r5], m0, 4
8883 pextrb [r5 + r1], m0, 5
8884 pextrb [r5 + r1 * 2], m0, 6
8885 pextrb [r5 + r4], m0, 7
8886 lea r5, [r5 + r1 * 4]
8887 pextrb [r5], m0, 8
8888 pextrb [r5 + r1], m0, 9
8889 pextrb [r5 + r1 * 2], m0, 10
8890 pextrb [r5 + r4], m0, 11
8891 lea r5, [r5 + r1 * 4]
8892 pextrb [r5], m0, 12
8893 pextrb [r5 + r1], m0, 13
8894 pextrb [r5 + r1 * 2], m0, 14
8895 pextrb [r5 + r4], m0, 15
8896
8897.quit:
8898 lea r3, [r3 + 16] ; next 16 reference bytes
8899 add r0, 16 ; next 16 output columns
8900 dec r6d
8901 jnz .loop
8902 RET
8903
8904;------------------------------------------------------------------------------------------------------------------
8905; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8906;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 27, 32x32 block (SSE4).
; Runs the MODE_9_27 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
8907INIT_XMM sse4
8908cglobal intra_pred_ang32_27, 3,7,8
8909 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment)
8910 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
8911 mov r4d, 4 ; strip counter
8912 lea r5, [r1 * 3] ; r5 = 3 * stride
8913 mov r6, r0 ; r6 = start of the current strip in dst
8914 mova m7, [pw_1024]
8915.loop:
8916 MODE_9_27 0
8917 add r6, 8
8918 mov r0, r6
8919 add r2, 8
8920 dec r4
8921 jnz .loop
8922 RET
8923
8924;------------------------------------------------------------------------------------------------------------------
8925; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8926;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 28, 32x32 block (SSE4).
; Runs the MODE_8_28 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
8927INIT_XMM sse4
8928cglobal intra_pred_ang32_28, 3,7,8
8929 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment)
8930 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
8931 mov r4d, 4 ; strip counter
8932 lea r5, [r1 * 3] ; r5 = 3 * stride
8933 mov r6, r0 ; r6 = start of the current strip in dst
8934 mova m7, [pw_1024]
8935.loop:
8936 MODE_8_28 0
8937 add r6, 8
8938 mov r0, r6
8939 add r2, 8
8940 dec r4
8941 jnz .loop
8942 RET
8943
8944;------------------------------------------------------------------------------------------------------------------
8945; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8946;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 29, 32x32 block (SSE4).
; Runs the MODE_7_29 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
8947INIT_XMM sse4
8948cglobal intra_pred_ang32_29, 3,7,8
8949 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment)
8950 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
8951 mov r4d, 4 ; strip counter
8952 lea r5, [r1 * 3] ; r5 = 3 * stride
8953 mov r6, r0 ; r6 = start of the current strip in dst
8954 mova m7, [pw_1024]
8955.loop:
8956 MODE_7_29 0
8957 add r6, 8
8958 mov r0, r6
8959 add r2, 8
8960 dec r4
8961 jnz .loop
8962 RET
8963
8964;------------------------------------------------------------------------------------------------------------------
8965; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8966;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 30, 32x32 block (SSE4).
; Runs the MODE_6_30 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
8967INIT_XMM sse4
8968cglobal intra_pred_ang32_30, 3,7,8
8969 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment)
8970 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
8971 mov r4d, 4 ; strip counter
8972 lea r5, [r1 * 3] ; r5 = 3 * stride
8973 mov r6, r0 ; r6 = start of the current strip in dst
8974 mova m7, [pw_1024]
8975.loop:
8976 MODE_6_30 0
8977 add r6, 8
8978 mov r0, r6
8979 add r2, 8
8980 dec r4
8981 jnz .loop
8982 RET
8983
8984;------------------------------------------------------------------------------------------------------------------
8985; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8986;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 31, 32x32 block (SSE4).
; Runs the MODE_5_31 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
8987INIT_XMM sse4
8988cglobal intra_pred_ang32_31, 3,7,8
8989 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment)
8990 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
8991 mov r4d, 4 ; strip counter
8992 lea r5, [r1 * 3] ; r5 = 3 * stride
8993 mov r6, r0 ; r6 = start of the current strip in dst
8994 mova m7, [pw_1024]
8995.loop:
8996 MODE_5_31 0
8997 add r6, 8
8998 mov r0, r6
8999 add r2, 8
9000 dec r4
9001 jnz .loop
9002 RET
9003
9004;-----------------------------------------------------------------------------------------------------------------
9005; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9006;-----------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 32, 32x32 block (SSE4).
; Runs the MODE_4_32 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
9007INIT_XMM sse4
9008cglobal intra_pred_ang32_32, 3,7,8
9009 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment)
9010 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
9011 mov r4d, 4 ; strip counter
9012 lea r5, [r1 * 3] ; r5 = 3 * stride
9013 mov r6, r0 ; r6 = start of the current strip in dst
9014 mova m7, [pw_1024]
9015.loop:
9016 MODE_4_32 0
9017 add r6, 8
9018 mov r0, r6
9019 add r2, 8
9020 dec r4
9021 jnz .loop
9022 RET
9023
9024;------------------------------------------------------------------------------------------------------------------
9025; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9026;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 33, 32x32 block (SSE4).
; Runs the MODE_3_33 macro (defined elsewhere in this file) four times, once per
; 8-byte-wide column strip, advancing dst (r0/r6) and the reference pointer r2 by 8 each pass.
9027INIT_XMM sse4
9028cglobal intra_pred_ang32_33, 3,7,8
9029 mov r2, r3mp ; r2 = argument 3 (refAbove per the prototype comment); a plain load,
 ; as in modes 27-32: r3 is reloaded on the next line, so no swap is needed
9030 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes
9031 mov r4d, 4 ; strip counter
9032 lea r5, [r1 * 3] ; r5 = 3 * stride
9033 mov r6, r0 ; r6 = start of the current strip in dst
9034 mova m7, [pw_1024]
9035.loop:
9036 MODE_3_33 0
9037 add r6, 8
9038 mov r0, r6
9039 add r2, 8
9040 dec r4
9041 jnz .loop
9042 RET
9043
9044;-----------------------------------------------------------------------------
9045; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
9046;-----------------------------------------------------------------------------
; Computes the angular intra predictions for modes 2..34 of one 4x4 block in a single call (SSE4).
; Registers read below: r0 = output buffer, r1 and r2 = two reference arrays (above0 and left0
; in the prototype comment). r5 is reloaded with the address of ang_table, so the incoming
; sixth argument is never read; r3 and r4 are not referenced.
; Output layout: each mode occupies 16 bytes at r0 + (mode - 2) * 16, written as four
; 4-byte rows (offsets +0, +4, +8, +12).
; Weighted rows all use the same three-instruction pattern:
;   pmaddubsw (interleaved byte pairs x one 16-byte ang_table row), pmulhrsw by pw_1024
;   (a rounded shift right by 5), packuswb (saturate back to bytes).
; Several results are kept in registers and reused by later modes (m5, m6, m7); the
; instruction order therefore matters.
9047INIT_XMM sse4
9048cglobal all_angs_pred_4x4, 6, 6, 8
9049
9050; mode 2: plain copies of [r2 + 2] shifted by 0..3 bytes
9051
9052movh m0, [r2 + 2]
9053movd [r0], m0
9054
9055palignr m1, m0, 1
9056movd [r0 + 4], m1
9057
9058palignr m1, m0, 2
9059movd [r0 + 8], m1
9060
9061psrldq m0, 3
9062movd [r0 + 12], m0
9063
9064; mode 3
9065
9066mova m0, [pw_1024] ; m0 = rounding multiplier for every pmulhrsw below
9067
9068movh m1, [r2 + 1]
9069
9070palignr m2, m1, 1
9071punpcklbw m1, m2 ; m1 = byte pairs ([r2+1+i], [r2+2+i])
9072
9073lea r5, [ang_table] ; r5 -> ang_table (16 bytes per row)
9074
9075pmaddubsw m5, m1, [r5 + 26 * 16] ; m5 is stored again by mode 6
9076pmulhrsw m5, m0
9077packuswb m5, m5
9078movd [r0 + 16], m5
9079
9080palignr m2, m1, 2 ; m2 = pairs advanced by one pixel
9081
9082mova m7, [r5 + 20 * 16] ; m7 = ang_table row 20, reused by modes 4, 8, 28, 30 and 32
9083
9084pmaddubsw m6, m2, m7 ; m6 is stored again by mode 6
9085pmulhrsw m6, m0
9086packuswb m6, m6
9087movd [r0 + 20], m6
9088
9089palignr m3, m1, 4 ; m3 = pairs advanced by two pixels
9090
9091pmaddubsw m4, m3, [r5 + 14 * 16]
9092pmulhrsw m4, m0
9093packuswb m4, m4
9094movd [r0 + 24], m4
9095
9096palignr m4, m1, 6 ; pairs advanced by three pixels
9097
9098pmaddubsw m4, [r5 + 8 * 16]
9099pmulhrsw m4, m0
9100packuswb m4, m4
9101movd [r0 + 28], m4
9102
9103; mode 4
9104
9105pmaddubsw m4, m1, [r5 + 21 * 16]
9106pmulhrsw m4, m0
9107packuswb m4, m4
9108movd [r0 + 32], m4
9109
9110pmaddubsw m4, m2, [r5 + 10 * 16]
9111pmulhrsw m4, m0
9112packuswb m4, m4
9113movd [r0 + 36], m4
9114
9115pmaddubsw m4, m2, [r5 + 31 * 16]
9116pmulhrsw m4, m0
9117packuswb m4, m4
9118movd [r0 + 40], m4
9119
9120pmaddubsw m4, m3, m7
9121pmulhrsw m4, m0
9122packuswb m4, m4
9123movd [r0 + 44], m4
9124
9125; mode 5
9126
9127pmaddubsw m4, m1, [r5 + 17 * 16]
9128pmulhrsw m4, m0
9129packuswb m4, m4
9130movd [r0 + 48], m4
9131
9132pmaddubsw m4, m2, [r5 + 2 * 16]
9133pmulhrsw m4, m0
9134packuswb m4, m4
9135movd [r0 + 52], m4
9136
9137pmaddubsw m4, m2, [r5 + 19 * 16]
9138pmulhrsw m4, m0
9139packuswb m4, m4
9140movd [r0 + 56], m4
9141
9142pmaddubsw m3, [r5 + 4 * 16]
9143pmulhrsw m3, m0
9144packuswb m3, m3
9145movd [r0 + 60], m3
9146
9147; mode 6
9148
9149pmaddubsw m3, m1, [r5 + 13 * 16]
9150pmulhrsw m3, m0
9151packuswb m3, m3
9152movd [r0 + 64], m3
9153
9154movd [r0 + 68], m5 ; reuse mode 3 row 0 (same pairs, table row 26)
9155
9156pmaddubsw m3, m2, [r5 + 7 * 16]
9157pmulhrsw m3, m0
9158packuswb m3, m3
9159movd [r0 + 72], m3
9160
9161movd [r0 + 76], m6 ; reuse mode 3 row 1 (same pairs, table row 20)
9162
9163; mode 7
9164
9165pmaddubsw m3, m1, [r5 + 9 * 16]
9166pmulhrsw m3, m0
9167packuswb m3, m3
9168movd [r0 + 80], m3
9169
9170pmaddubsw m3, m1, [r5 + 18 * 16]
9171pmulhrsw m3, m0
9172packuswb m3, m3
9173movd [r0 + 84], m3
9174
9175pmaddubsw m3, m1, [r5 + 27 * 16]
9176pmulhrsw m3, m0
9177packuswb m3, m3
9178movd [r0 + 88], m3
9179
9180pmaddubsw m2, [r5 + 4 * 16]
9181pmulhrsw m2, m0
9182packuswb m2, m2
9183movd [r0 + 92], m2
9184
9185; mode 8
9186
9187pmaddubsw m2, m1, [r5 + 5 * 16]
9188pmulhrsw m2, m0
9189packuswb m2, m2
9190movd [r0 + 96], m2
9191
9192pmaddubsw m2, m1, [r5 + 10 * 16]
9193pmulhrsw m2, m0
9194packuswb m2, m2
9195movd [r0 + 100], m2
9196
9197pmaddubsw m2, m1, [r5 + 15 * 16]
9198pmulhrsw m2, m0
9199packuswb m2, m2
9200movd [r0 + 104], m2
9201
9202pmaddubsw m2, m1, m7
9203pmulhrsw m2, m0
9204packuswb m2, m2
9205movd [r0 + 108], m2
9206
9207; mode 9
9208
9209pmaddubsw m2, m1, [r5 + 2 * 16]
9210pmulhrsw m2, m0
9211packuswb m2, m2
9212movd [r0 + 112], m2
9213
9214pmaddubsw m2, m1, [r5 + 4 * 16]
9215pmulhrsw m2, m0
9216packuswb m2, m2
9217movd [r0 + 116], m2
9218
9219pmaddubsw m2, m1, [r5 + 6 * 16]
9220pmulhrsw m2, m0
9221packuswb m2, m2
9222movd [r0 + 120], m2
9223
9224pmaddubsw m1, [r5 + 8 * 16]
9225pmulhrsw m1, m0
9226packuswb m1, m1
9227movd [r0 + 124], m1
9228
9229; mode 10: four copies of [r2 + 1 .. r2 + 4], then byte 0 of each row is filtered
9230
9231movh m1, [r2]
9232palignr m2, m1, 1
9233pshufd m3, m2, 0 ; replicate the low dword to all four rows
9234movu [r0 + 128], m3
9235
9236pxor m3, m3
9237
9238pshufb m4, m2, m3 ; broadcast [r2 + 1]
9239punpcklbw m4, m3 ; ... widened to words
9240
9241movh m5, [r1]
9242
9243pshufb m6, m5, m3 ; broadcast [r1 + 0]
9244punpcklbw m6, m3
9245
9246psrldq m5, 1
9247punpcklbw m5, m3 ; m5 = [r1 + 1 ..] as words
9248
9249psubw m5, m6
9250psraw m5, 1
9251
9252paddw m4, m5 ; [r2 + 1] + (([r1 + 1 + k] - [r1]) >> 1)
9253
9254packuswb m4, m3
9255
9256pextrb [r0 + 128], m4, 0
9257pextrb [r0 + 132], m4, 1
9258pextrb [r0 + 136], m4, 2
9259pextrb [r0 + 140], m4, 3
9260
9261; mode 11
9262
9263palignr m2, m1, 1
9264punpcklbw m1, m2 ; m1 = byte pairs ([r2+i], [r2+1+i])
9265
9266pmaddubsw m2, m1, [r5 + 30 * 16]
9267pmulhrsw m2, m0
9268packuswb m2, m2
9269movd [r0 + 144], m2
9270
9271pmaddubsw m2, m1, [r5 + 28 * 16]
9272pmulhrsw m2, m0
9273packuswb m2, m2
9274movd [r0 + 148], m2
9275
9276pmaddubsw m2, m1, [r5 + 26 * 16]
9277pmulhrsw m2, m0
9278packuswb m2, m2
9279movd [r0 + 152], m2
9280
9281pmaddubsw m2, m1, [r5 + 24 * 16]
9282pmulhrsw m2, m0
9283packuswb m2, m2
9284movd [r0 + 156], m2
9285
9286; mode 12
9287
9288pmaddubsw m2, m1, [r5 + 27 * 16]
9289pmulhrsw m2, m0
9290packuswb m2, m2
9291movd [r0 + 160], m2
9292
9293pmaddubsw m2, m1, [r5 + 22 * 16]
9294pmulhrsw m2, m0
9295packuswb m2, m2
9296movd [r0 + 164], m2
9297
9298pmaddubsw m2, m1, [r5 + 17 * 16]
9299pmulhrsw m2, m0
9300packuswb m2, m2
9301movd [r0 + 168], m2
9302
9303pmaddubsw m2, m1, [r5 + 12 * 16]
9304pmulhrsw m2, m0
9305packuswb m2, m2
9306movd [r0 + 172], m2
9307
9308; mode 13
9309
9310pmaddubsw m2, m1, [r5 + 23 * 16]
9311pmulhrsw m2, m0
9312packuswb m2, m2
9313movd [r0 + 176], m2
9314
9315pmaddubsw m2, m1, [r5 + 14 * 16]
9316pmulhrsw m2, m0
9317packuswb m2, m2
9318movd [r0 + 180], m2
9319
9320pmaddubsw m2, m1, [r5 + 5 * 16]
9321pmulhrsw m2, m0
9322packuswb m2, m2
9323movd [r0 + 184], m2
9324
9325pslldq m2, m1, 2 ; make room for one extra pair in front of the row
9326pinsrb m2, [r1 + 0], 1 ; new pair = ([r1 + 4], [r1 + 0])
9327pinsrb m2, [r1 + 4], 0
9328
9329pmaddubsw m3, m2, [r5 + 28 * 16]
9330pmulhrsw m3, m0
9331packuswb m3, m3
9332movd [r0 + 188], m3
9333
9334; mode 14
9335
9336pmaddubsw m3, m1, [r5 + 19 * 16]
9337pmulhrsw m3, m0
9338packuswb m3, m3
9339movd [r0 + 192], m3
9340
9341pmaddubsw m5, m1, [r5 + 6 * 16] ; m5 is stored again as mode 17 row 0
9342pmulhrsw m5, m0
9343packuswb m5, m5
9344movd [r0 + 196], m5
9345
9346pinsrb m2, [r1 + 2], 0 ; leading pair becomes ([r1 + 2], [r1 + 0])
9347
9348pmaddubsw m3, m2, [r5 + 25 * 16]
9349pmulhrsw m3, m0
9350packuswb m3, m3
9351movd [r0 + 200], m3
9352
9353pmaddubsw m3, m2, [r5 + 12 * 16]
9354pmulhrsw m3, m0
9355packuswb m3, m3
9356movd [r0 + 204], m3
9357
9358; mode 15
9359
9360pmaddubsw m3, m1, [r5 + 15 * 16]
9361pmulhrsw m3, m0
9362packuswb m3, m3
9363movd [r0 + 208], m3
9364
9365pmaddubsw m3, m2, [r5 + 30 * 16]
9366pmulhrsw m3, m0
9367packuswb m3, m3
9368movd [r0 + 212], m3
9369
9370pmaddubsw m3, m2, [r5 + 13 * 16]
9371pmulhrsw m3, m0
9372packuswb m3, m3
9373movd [r0 + 216], m3
9374
9375pslldq m3, m2, 2 ; prepend a second extra pair
9376pinsrb m3, [r1 + 2], 1 ; new pair = ([r1 + 4], [r1 + 2])
9377pinsrb m3, [r1 + 4], 0
9378
9379pmaddubsw m4, m3, [r5 + 28 * 16]
9380pmulhrsw m4, m0
9381packuswb m4, m4
9382movd [r0 + 220], m4
9383
9384; mode 16
9385
9386pmaddubsw m4, m1, [r5 + 11 * 16]
9387pmulhrsw m4, m0
9388packuswb m4, m4
9389movd [r0 + 224], m4
9390
9391pmaddubsw m4, m2, [r5 + 22 * 16]
9392pmulhrsw m4, m0
9393packuswb m4, m4
9394movd [r0 + 228], m4
9395
9396pmaddubsw m4, m2, [r5 + 1 * 16]
9397pmulhrsw m4, m0
9398packuswb m4, m4
9399movd [r0 + 232], m4
9400
9401pinsrb m3, [r1 + 3], 0 ; leading pair becomes ([r1 + 3], [r1 + 2])
9402
9403pmaddubsw m3, [r5 + 12 * 16]
9404pmulhrsw m3, m0
9405packuswb m3, m3
9406movd [r0 + 236], m3
9407
9408; mode 17
9409
9410movd [r0 + 240], m5 ; reuse mode 14 row 1 (same pairs, table row 6)
9411
9412pslldq m1, 2
9413pinsrb m1, [r1 + 1], 0
9414pinsrb m1, [r1 + 0], 1
9415
9416pmaddubsw m2, m1, [r5 + 12 * 16]
9417pmulhrsw m2, m0
9418packuswb m2, m2
9419movd [r0 + 244], m2
9420
9421pslldq m1, 2
9422pinsrb m1, [r1 + 2], 0
9423pinsrb m1, [r1 + 1], 1
9424
9425pmaddubsw m2, m1, [r5 + 18 * 16]
9426pmulhrsw m2, m0
9427packuswb m2, m2
9428movd [r0 + 248], m2
9429
9430pslldq m1, 2
9431pinsrb m1, [r1 + 4], 0
9432pinsrb m1, [r1 + 2], 1
9433
9434pmaddubsw m1, [r5 + 24 * 16]
9435pmulhrsw m1, m0
9436packuswb m1, m1
9437movd [r0 + 252], m1
9438
9439; mode 18: row 0 is [r1 ..]; each later row shifts right by one byte and takes its first byte from [r2 + row]
9440
9441movh m1, [r1]
9442movd [r0 + 256], m1
9443
9444pslldq m2, m1, 1
9445pinsrb m2, [r2 + 1], 0
9446movd [r0 + 260], m2
9447
9448pslldq m3, m2, 1
9449pinsrb m3, [r2 + 2], 0
9450movd [r0 + 264], m3
9451
9452pslldq m4, m3, 1
9453pinsrb m4, [r2 + 3], 0
9454movd [r0 + 268], m4
9455
9456; mode 19 (modes 19..25 repeat the table rows of modes 17..11 with r1 and r2 exchanged)
9457
9458palignr m4, m1, 1
9459punpcklbw m1, m4 ; m1 = byte pairs ([r1+i], [r1+1+i])
9460
9461pmaddubsw m5, m1, [r5 + 6 * 16] ; m5 is stored again as mode 22 row 1
9462pmulhrsw m5, m0
9463packuswb m5, m5
9464movd [r0 + 272], m5
9465
9466pslldq m2, m1, 2
9467pinsrb m2, [r2 + 1], 0
9468pinsrb m2, [r2], 1
9469
9470pmaddubsw m3, m2, [r5 + 12 * 16]
9471pmulhrsw m3, m0
9472packuswb m3, m3
9473movd [r0 + 276], m3
9474
9475pslldq m3, m2, 2
9476pinsrb m3, [r2 + 1], 1
9477pinsrb m3, [r2 + 2], 0
9478
9479pmaddubsw m4, m3, [r5 + 18 * 16]
9480pmulhrsw m4, m0
9481packuswb m4, m4
9482movd [r0 + 280], m4
9483
9484pslldq m3, 2
9485pinsrb m3, [r2 + 2], 1
9486pinsrb m3, [r2 + 4], 0
9487
9488pmaddubsw m3, [r5 + 24 * 16]
9489pmulhrsw m3, m0
9490packuswb m3, m3
9491movd [r0 + 284], m3
9492
9493; mode 20
9494
9495pmaddubsw m3, m1, [r5 + 11 * 16]
9496pmulhrsw m3, m0
9497packuswb m3, m3
9498movd [r0 + 288], m3
9499
9500pinsrb m2, [r2 + 2], 0 ; leading pair becomes ([r2 + 2], [r2 + 0])
9501
9502pmaddubsw m3, m2, [r5 + 22 * 16]
9503pmulhrsw m3, m0
9504packuswb m3, m3
9505movd [r0 + 292], m3
9506
9507pmaddubsw m3, m2, [r5 + 1 * 16]
9508pmulhrsw m3, m0
9509packuswb m3, m3
9510movd [r0 + 296], m3
9511
9512pslldq m3, m2, 2
9513pinsrb m3, [r2 + 2], 1
9514pinsrb m3, [r2 + 3], 0
9515
9516pmaddubsw m4, m3, [r5 + 12 * 16]
9517pmulhrsw m4, m0
9518packuswb m4, m4
9519movd [r0 + 300], m4
9520
9521; mode 21
9522
9523pmaddubsw m4, m1, [r5 + 15 * 16]
9524pmulhrsw m4, m0
9525packuswb m4, m4
9526movd [r0 + 304], m4
9527
9528pmaddubsw m4, m2, [r5 + 30 * 16]
9529pmulhrsw m4, m0
9530packuswb m4, m4
9531movd [r0 + 308], m4
9532
9533pmaddubsw m4, m2, [r5 + 13 * 16]
9534pmulhrsw m4, m0
9535packuswb m4, m4
9536movd [r0 + 312], m4
9537
9538pinsrb m3, [r2 + 4], 0 ; leading pair becomes ([r2 + 4], [r2 + 2])
9539
9540pmaddubsw m3, [r5 + 28 * 16]
9541pmulhrsw m3, m0
9542packuswb m3, m3
9543movd [r0 + 316], m3
9544
9545; mode 22
9546
9547pmaddubsw m3, m1, [r5 + 19 * 16]
9548pmulhrsw m3, m0
9549packuswb m3, m3
9550movd [r0 + 320], m3
9551
9552movd [r0 + 324], m5 ; reuse mode 19 row 0 (same pairs, table row 6)
9553
9554pmaddubsw m3, m2, [r5 + 25 * 16]
9555pmulhrsw m3, m0
9556packuswb m3, m3
9557movd [r0 + 328], m3
9558
9559pmaddubsw m3, m2, [r5 + 12 * 16]
9560pmulhrsw m3, m0
9561packuswb m3, m3
9562movd [r0 + 332], m3
9563
9564; mode 23
9565
9566pmaddubsw m3, m1, [r5 + 23 * 16]
9567pmulhrsw m3, m0
9568packuswb m3, m3
9569movd [r0 + 336], m3
9570
9571pmaddubsw m3, m1, [r5 + 14 * 16]
9572pmulhrsw m3, m0
9573packuswb m3, m3
9574movd [r0 + 340], m3
9575
9576pmaddubsw m3, m1, [r5 + 5 * 16]
9577pmulhrsw m3, m0
9578packuswb m3, m3
9579movd [r0 + 344], m3
9580
9581pinsrb m2, [r2 + 4], 0 ; leading pair becomes ([r2 + 4], [r2 + 0])
9582
9583pmaddubsw m2, [r5 + 28 * 16]
9584pmulhrsw m2, m0
9585packuswb m2, m2
9586movd [r0 + 348], m2
9587
9588; mode 24
9589
9590pmaddubsw m2, m1, [r5 + 27 * 16]
9591pmulhrsw m2, m0
9592packuswb m2, m2
9593movd [r0 + 352], m2
9594
9595pmaddubsw m2, m1, [r5 + 22 * 16]
9596pmulhrsw m2, m0
9597packuswb m2, m2
9598movd [r0 + 356], m2
9599
9600pmaddubsw m2, m1, [r5 + 17 * 16]
9601pmulhrsw m2, m0
9602packuswb m2, m2
9603movd [r0 + 360], m2
9604
9605pmaddubsw m2, m1, [r5 + 12 * 16]
9606pmulhrsw m2, m0
9607packuswb m2, m2
9608movd [r0 + 364], m2
9609
9610; mode 25
9611
9612pmaddubsw m2, m1, [r5 + 30 * 16]
9613pmulhrsw m2, m0
9614packuswb m2, m2
9615movd [r0 + 368], m2
9616
9617pmaddubsw m2, m1, [r5 + 28 * 16]
9618pmulhrsw m2, m0
9619packuswb m2, m2
9620movd [r0 + 372], m2
9621
9622pmaddubsw m2, m1, [r5 + 26 * 16]
9623pmulhrsw m2, m0
9624packuswb m2, m2
9625movd [r0 + 376], m2
9626
9627pmaddubsw m2, m1, [r5 + 24 * 16]
9628pmulhrsw m2, m0
9629packuswb m2, m2
9630movd [r0 + 380], m2
9631
9632; mode 26: four copies of [r1 + 1 .. r1 + 4], then byte 0 of each row is filtered
9633
9634movh m1, [r1 + 1]
9635pshufd m2, m1, 0 ; replicate the low dword to all four rows
9636movu [r0 + 384], m2
9637
9638pxor m2, m2
9639
9640pshufb m3, m1, m2 ; broadcast [r1 + 1]
9641punpcklbw m3, m2
9642
9643movh m4, [r2]
9644
9645pshufb m5, m4, m2 ; broadcast [r2 + 0]
9646punpcklbw m5, m2
9647
9648psrldq m4, 1
9649punpcklbw m4, m2 ; m4 = [r2 + 1 ..] as words
9650
9651psubw m4, m5
9652psraw m4, 1
9653
9654paddw m3, m4 ; [r1 + 1] + (([r2 + 1 + k] - [r2]) >> 1)
9655
9656packuswb m3, m2
9657
9658pextrb [r0 + 384], m3, 0
9659pextrb [r0 + 388], m3, 1
9660pextrb [r0 + 392], m3, 2
9661pextrb [r0 + 396], m3, 3
9662
9663; mode 27 (modes 27..33 repeat the table rows of modes 9..3 on the [r1 + 1] pairs)
9664
9665palignr m2, m1, 1
9666punpcklbw m1, m2 ; m1 = byte pairs ([r1+1+i], [r1+2+i])
9667
9668pmaddubsw m2, m1, [r5 + 2 * 16]
9669pmulhrsw m2, m0
9670packuswb m2, m2
9671movd [r0 + 400], m2
9672
9673pmaddubsw m2, m1, [r5 + 4 * 16]
9674pmulhrsw m2, m0
9675packuswb m2, m2
9676movd [r0 + 404], m2
9677
9678pmaddubsw m2, m1, [r5 + 6 * 16]
9679pmulhrsw m2, m0
9680packuswb m2, m2
9681movd [r0 + 408], m2
9682
9683pmaddubsw m2, m1, [r5 + 8 * 16]
9684pmulhrsw m2, m0
9685packuswb m2, m2
9686movd [r0 + 412], m2
9687
9688; mode 28
9689
9690pmaddubsw m2, m1, [r5 + 5 * 16]
9691pmulhrsw m2, m0
9692packuswb m2, m2
9693movd [r0 + 416], m2
9694
9695pmaddubsw m2, m1, [r5 + 10 * 16]
9696pmulhrsw m2, m0
9697packuswb m2, m2
9698movd [r0 + 420], m2
9699
9700pmaddubsw m2, m1, [r5 + 15 * 16]
9701pmulhrsw m2, m0
9702packuswb m2, m2
9703movd [r0 + 424], m2
9704
9705pmaddubsw m2, m1, m7 ; m7 still holds ang_table row 20
9706pmulhrsw m2, m0
9707packuswb m2, m2
9708movd [r0 + 428], m2
9709
9710; mode 29
9711
9712pmaddubsw m2, m1, [r5 + 9 * 16]
9713pmulhrsw m2, m0
9714packuswb m2, m2
9715movd [r0 + 432], m2
9716
9717pmaddubsw m2, m1, [r5 + 18 * 16]
9718pmulhrsw m2, m0
9719packuswb m2, m2
9720movd [r0 + 436], m2
9721
9722pmaddubsw m2, m1, [r5 + 27 * 16]
9723pmulhrsw m2, m0
9724packuswb m2, m2
9725movd [r0 + 440], m2
9726
9727palignr m2, m1, 2 ; m2 = pairs advanced by one pixel
9728
9729pmaddubsw m3, m2, [r5 + 4 * 16]
9730pmulhrsw m3, m0
9731packuswb m3, m3
9732movd [r0 + 444], m3
9733
9734; mode 30
9735
9736pmaddubsw m3, m1, [r5 + 13 * 16]
9737pmulhrsw m3, m0
9738packuswb m3, m3
9739movd [r0 + 448], m3
9740
9741pmaddubsw m6, m1, [r5 + 26 * 16] ; m6 is stored again as mode 33 row 0
9742pmulhrsw m6, m0
9743packuswb m6, m6
9744movd [r0 + 452], m6
9745
9746pmaddubsw m3, m2, [r5 + 7 * 16]
9747pmulhrsw m3, m0
9748packuswb m3, m3
9749movd [r0 + 456], m3
9750
9751pmaddubsw m5, m2, m7 ; m5 is stored again as mode 33 row 1
9752pmulhrsw m5, m0
9753packuswb m5, m5
9754movd [r0 + 460], m5
9755
9756; mode 31
9757
9758pmaddubsw m3, m1, [r5 + 17 * 16]
9759pmulhrsw m3, m0
9760packuswb m3, m3
9761movd [r0 + 464], m3
9762
9763pmaddubsw m3, m2, [r5 + 2 * 16]
9764pmulhrsw m3, m0
9765packuswb m3, m3
9766movd [r0 + 468], m3
9767
9768pmaddubsw m3, m2, [r5 + 19 * 16]
9769pmulhrsw m3, m0
9770packuswb m3, m3
9771movd [r0 + 472], m3
9772
9773palignr m3, m2, 2 ; m3 = pairs advanced by two pixels
9774
9775pmaddubsw m4, m3, [r5 + 4 * 16]
9776pmulhrsw m4, m0
9777packuswb m4, m4
9778movd [r0 + 476], m4
9779
9780; mode 32
9781
9782pmaddubsw m4, m1, [r5 + 21 * 16]
9783pmulhrsw m4, m0
9784packuswb m4, m4
9785movd [r0 + 480], m4
9786
9787pmaddubsw m4, m2, [r5 + 10 * 16]
9788pmulhrsw m4, m0
9789packuswb m4, m4
9790movd [r0 + 484], m4
9791
9792pmaddubsw m4, m2, [r5 + 31 * 16]
9793pmulhrsw m4, m0
9794packuswb m4, m4
9795movd [r0 + 488], m4
9796
9797pmaddubsw m4, m3, m7
9798pmulhrsw m4, m0
9799packuswb m4, m4
9800movd [r0 + 492], m4
9801
9802; mode 33
9803
9804movd [r0 + 496], m6 ; reuse mode 30 row 1 (table row 26)
9805
9806movd [r0 + 500], m5 ; reuse mode 30 row 3 (table row 20)
9807
9808pmaddubsw m4, m3, [r5 + 14 * 16]
9809pmulhrsw m4, m0
9810packuswb m4, m4
9811movd [r0 + 504], m4
9812
9813psrldq m3, 2 ; pairs advanced by three pixels
9814
9815pmaddubsw m3, [r5 + 8 * 16]
9816pmulhrsw m3, m0
9817packuswb m3, m3
9818movd [r0 + 508], m3
9819
9820; mode 34: plain copies of [r1 + 2] shifted by 0..3 bytes (m0 no longer holds pw_1024 from here on)
9821
9822movh m0, [r1 + 2]
9823movd [r0 + 512], m0
9824
9825palignr m1, m0, 1
9826movd [r0 + 516], m1
9827
9828palignr m1, m0, 2
9829movd [r0 + 520], m1
9830
9831palignr m1, m0, 3
9832movd [r0 + 524], m1
9833
9834RET
9835
9836;-----------------------------------------------------------------------------
9837; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
9838;-----------------------------------------------------------------------------
9839INIT_XMM sse4
9840cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
9841
9842; mode 2
9843
9844movu m0, [r4 + 2]
9845
9846palignr m1, m0, 1
9847punpcklqdq m2, m0, m1
9848movu [r0], m2
9849
9850palignr m1, m0, 2
9851palignr m2, m0, 3
9852punpcklqdq m1, m2
9853movu [r0 + 16], m1
9854
9855palignr m1, m0, 4
9856palignr m2, m0, 5
9857punpcklqdq m1, m2
9858movu [r0 + 32], m1
9859
9860palignr m1, m0, 6
9861palignr m2, m0, 7
9862punpcklqdq m1, m2
9863movu [r0 + 48], m1
9864
9865; mode 3 [row 0, 1]
9866
9867mova m7, [pw_1024]
9868lea r5, [ang_table]
9869
9870movu m0, [r2 + 1]
9871
9872palignr m1, m0, 1
9873palignr m2, m0, 2
9874
9875punpcklbw m3, m0, m1
9876pmaddubsw m4, m3, [r5 + 26 * 16]
9877pmulhrsw m4, m7
9878
9879punpcklbw m1, m2
9880pmaddubsw m5, m1, [r5 + 20 * 16]
9881pmulhrsw m5, m7
9882
9883packuswb m4, m5
9884
9885movu [r0 + 64], m4
9886
9887; mode 6 [row 1]
9888
9889movh [r0 + 264], m4
9890
9891; mode 6 [row 3]
9892
9893movhps [r0 + 280], m4
9894
9895; mode 4 [row 0, 1]
9896
9897pmaddubsw m4, m3, [r5 + 21 * 16]
9898pmulhrsw m4, m7
9899
9900pmaddubsw m5, m1, [r5 + 10 * 16]
9901pmulhrsw m5, m7
9902
9903packuswb m4, m5
9904movu [r0 + 128], m4
9905
9906; mode 5 [row 0, 1]
9907
9908pmaddubsw m4, m3, [r5 + 17 * 16]
9909pmulhrsw m4, m7
9910
9911pmaddubsw m5, m1, [r5 + 2 * 16]
9912pmulhrsw m5, m7
9913
9914packuswb m4, m5
9915movu [r0 + 192], m4
9916
9917; mode 6 [row 0]
9918
9919pmaddubsw m4, m3, [r5 + 13 * 16]
9920pmulhrsw m4, m7
9921
9922pxor m5, m5
9923
9924packuswb m4, m5
9925movh [r0 + 256], m4
9926
9927; mode 7 [row 0, 1]
9928
9929pmaddubsw m4, m3, [r5 + 9 * 16]
9930pmulhrsw m4, m7
9931
9932pmaddubsw m5, m3, [r5 + 18 * 16]
9933pmulhrsw m5, m7
9934
9935packuswb m4, m5
9936movu [r0 + 320], m4
9937
9938; mode 8 [row 0, 1]
9939
9940pmaddubsw m4, m3, [r5 + 5 * 16]
9941pmulhrsw m4, m7
9942
9943pmaddubsw m5, m3, [r5 + 10 * 16]
9944pmulhrsw m5, m7
9945
9946packuswb m4, m5
9947movu [r0 + 384], m4
9948
9949; mode 8 [row 2, 3]
9950
9951pmaddubsw m4, m3, [r5 + 15 * 16]
9952pmulhrsw m4, m7
9953
9954pmaddubsw m5, m3, [r5 + 20 * 16]
9955pmulhrsw m5, m7
9956
9957packuswb m4, m5
9958movu [r0 + 400], m4
9959
9960; mode 8 [row 4, 5]
9961
9962pmaddubsw m4, m3, [r5 + 25 * 16]
9963pmulhrsw m4, m7
9964
9965pmaddubsw m5, m3, [r5 + 30 * 16]
9966pmulhrsw m5, m7
9967
9968packuswb m4, m5
9969movu [r0 + 416], m4
9970
9971; mode 8 [row 6, 7]
9972
9973pmaddubsw m4, m1, [r5 + 3 * 16]
9974pmulhrsw m4, m7
9975
9976pmaddubsw m5, m1, [r5 + 8 * 16]
9977pmulhrsw m5, m7
9978
9979packuswb m4, m5
9980movu [r0 + 432], m4
9981
9982; mode 9 [row 0, 1]
9983
9984pmaddubsw m4, m3, [r5 + 2 * 16]
9985pmulhrsw m4, m7
9986
9987pmaddubsw m5, m3, [r5 + 4 * 16]
9988pmulhrsw m5, m7
9989
9990packuswb m4, m5
9991movu [r0 + 448], m4
9992
9993; mode 9 [row 2, 3]
9994
9995pmaddubsw m4, m3, [r5 + 6 * 16]
9996pmulhrsw m4, m7
9997
9998pmaddubsw m5, m3, [r5 + 8 * 16]
9999pmulhrsw m5, m7
10000
10001packuswb m4, m5
10002movu [r0 + 464], m4
10003
10004; mode 9 [row 4, 5]
10005
10006pmaddubsw m4, m3, [r5 + 10 * 16]
10007pmulhrsw m4, m7
10008
10009pmaddubsw m5, m3, [r5 + 12 * 16]
10010pmulhrsw m5, m7
10011
10012packuswb m4, m5
10013movu [r0 + 480], m4
10014
10015; mode 9 [row 6, 7]
10016
10017pmaddubsw m4, m3, [r5 + 14 * 16]
10018pmulhrsw m4, m7
10019
10020pmaddubsw m5, m3, [r5 + 16 * 16]
10021pmulhrsw m5, m7
10022
10023packuswb m4, m5
10024movu [r0 + 496], m4
10025
10026; mode 7 [row 2, 3]
10027
10028pmaddubsw m4, m3, [r5 + 27 * 16]
10029pmulhrsw m4, m7
10030
10031pmaddubsw m5, m1, [r5 + 4 * 16]
10032pmulhrsw m5, m7
10033
10034packuswb m4, m5
10035movu [r0 + 336], m4
10036
10037; mode 7 [row 4, 5]
10038
10039pmaddubsw m4, m1, [r5 + 13 * 16]
10040pmulhrsw m4, m7
10041
10042pmaddubsw m5, m1, [r5 + 22 * 16]
10043pmulhrsw m5, m7
10044
10045packuswb m4, m5
10046movu [r0 + 352], m4
10047
10048; mode 6 [row 2]
10049
10050pmaddubsw m4, m1, [r5 + 7 * 16]
10051pmulhrsw m4, m7
10052
10053pxor m5, m5
10054
10055packuswb m4, m5
10056movh [r0 + 272], m4
10057
10058; mode 3 [row 2, 3]
10059
10060palignr m1, m0, 3
10061palignr m3, m0, 4
10062
10063punpcklbw m2, m1
10064pmaddubsw m5, m2, [r5 + 14 * 16]
10065pmulhrsw m5, m7
10066
10067punpcklbw m1, m3
10068pmaddubsw m6, m1, [r5 + 8 * 16]
10069pmulhrsw m6, m7
10070
10071packuswb m5, m6
10072movu [r0 + 80], m5
10073
10074; mode 6 [row 7]
10075
10076movhps [r0 + 312], m5
10077
10078; mode 6 [row 5]
10079
10080movh [r0 + 296], m5
10081
10082; mode 4 [calculate and store row 4, 5]
10083
10084pmaddubsw m4, m1, [r5 + 9 * 16]
10085pmulhrsw m4, m7
10086
10087pmaddubsw m5, m1, [r5 + 30 * 16]
10088pmulhrsw m5, m7
10089
10090packuswb m4, m5
10091movu [r0 + 160], m4
10092
10093; mode 5 [row 4, 5]
10094
10095pmaddubsw m4, m2, [r5 + 21 * 16]
10096pmulhrsw m4, m7
10097
10098pmaddubsw m5, m1, [r5 + 6 * 16]
10099pmulhrsw m5, m7
10100
10101packuswb m4, m5
10102movu [r0 + 224], m4
10103
10104; mode 6 [row 4]
10105
10106pmaddubsw m5, m2, [r5 + 1 * 16]
10107pmulhrsw m5, m7
10108
10109pxor m6, m6
10110
10111packuswb m5, m6
10112movh [r0 + 288], m5
10113
10114; mode 6 [row 6]
10115
10116pmaddubsw m5, m2, [r5 + 27 * 16]
10117pmulhrsw m5, m7
10118
10119pxor m6, m6
10120
10121packuswb m5, m6
10122movh [r0 + 304], m5
10123
10124; mode 5 [calculate row 6]
10125
10126pmaddubsw m6, m1, [r5 + 23 * 16]
10127pmulhrsw m6, m7
10128
10129; mode 3 [row 4, 5]
10130
10131palignr m1, m0, 5
10132
10133punpcklbw m3, m1
10134pmaddubsw m4, m3, [r5 + 2 * 16]
10135pmulhrsw m4, m7
10136
10137pmaddubsw m5, m3, [r5 + 28 * 16]
10138pmulhrsw m5, m7
10139
10140packuswb m4, m5
10141movu [r0 + 96], m4
10142
10143; mode 4 [calculate row 6]
10144
10145pmaddubsw m5, m3, [r5 + 19 * 16]
10146pmulhrsw m5, m7
10147
10148; mode 5 [calculate row 7]
10149
10150pmaddubsw m4, m3, [r5 + 8 * 16]
10151pmulhrsw m4, m7
10152
10153packuswb m6, m4
10154movu [r0 + 240], m6
10155
10156; mode 3 [row 6, 7]
10157
10158palignr m2, m0, 6
10159palignr m3, m0, 7
10160
10161punpcklbw m1, m2
10162pmaddubsw m4, m1, [r5 + 22 * 16]
10163pmulhrsw m4, m7
10164
10165punpcklbw m2, m3
10166pmaddubsw m2, [r5 + 16 * 16]
10167pmulhrsw m2, m7
10168
10169packuswb m4, m2
10170movu [r0 + 112], m4
10171
10172; mode 4 [calculate row 7]
10173
10174pmaddubsw m2, m1, [r5 + 8 * 16]
10175pmulhrsw m2, m7
10176
10177; mode 4 [store row 6 and 7]
10178
10179packuswb m5, m2
10180movu [r0 + 176], m5
10181
10182; mode 4 [row 2, 3]
10183
10184palignr m1, m0, 1
10185palignr m2, m0, 2
10186palignr m3, m0, 3
10187
10188punpcklbw m1, m2
10189pmaddubsw m4, m1, [r5 + 31 * 16]
10190pmulhrsw m4, m7
10191
10192punpcklbw m2, m3
10193pmaddubsw m5, m2, [r5 + 20 * 16]
10194pmulhrsw m5, m7
10195
10196packuswb m4, m5
10197movu [r0 + 144], m4
10198
10199; mode 5 [row 2, 3]
10200
10201pmaddubsw m4, m1, [r5 + 19 * 16]
10202pmulhrsw m4, m7
10203
10204pmaddubsw m5, m2, [r5 + 4 * 16]
10205pmulhrsw m5, m7
10206
10207packuswb m4, m5
10208movu [r0 + 208], m4
10209
10210; mode 7 [row 6, 7]
10211
10212pmaddubsw m4, m1, [r5 + 31 * 16]
10213pmulhrsw m4, m7
10214
10215pmaddubsw m5, m2, [r5 + 8 * 16]
10216pmulhrsw m5, m7
10217
10218packuswb m4, m5
10219movu [r0 + 368], m4
10220
10221; mode 10
10222
10223pshufb m1, m0, [tab_Si]
10224movu [r0 + 512], m1
10225movu [r0 + 528], m1
10226movu [r0 + 544], m1
10227movu [r0 + 560], m1
10228
10229pxor m0, m0
10230
10231pshufb m1, m1, m0
10232punpcklbw m1, m0
10233
10234movu m2, [r1]
10235
10236pshufb m3, m2, m0
10237punpcklbw m3, m0
10238
10239psrldq m4, m2, 1
10240punpcklbw m4, m0
10241
10242movu m2, [r1 + 9]
10243punpcklbw m2, m0
10244
10245psubw m4, m3
10246psubw m2, m3
10247
10248psraw m4, 1
10249psraw m2, 1
10250
10251paddw m4, m1
10252paddw m2, m1
10253
10254packuswb m4, m2
10255
10256pextrb [r0 + 512], m4, 0
10257pextrb [r0 + 520], m4, 1
10258pextrb [r0 + 528], m4, 2
10259pextrb [r0 + 536], m4, 3
10260pextrb [r0 + 544], m4, 4
10261pextrb [r0 + 552], m4, 5
10262pextrb [r0 + 560], m4, 6
10263pextrb [r0 + 568], m4, 7
10264
10265; mode 11 [row 0, 1]
10266
10267movu m0, [r2]
10268palignr m1, m0, 1
10269punpcklbw m2, m0, m1
10270
10271pmaddubsw m3, m2, [r5 + 30 * 16]
10272pmulhrsw m3, m7
10273
10274pmaddubsw m4, m2, [r5 + 28 * 16]
10275pmulhrsw m4, m7
10276
10277packuswb m3, m4
10278movu [r0 + 576], m3
10279
10280; mode 11 [row 2, 3]
10281
10282pmaddubsw m3, m2, [r5 + 26 * 16]
10283pmulhrsw m3, m7
10284
10285pmaddubsw m4, m2, [r5 + 24 * 16]
10286pmulhrsw m4, m7
10287
10288packuswb m3, m4
10289movu [r0 + 592], m3
10290
10291; mode 11 [row 4, 5]
10292
10293pmaddubsw m3, m2, [r5 + 22 * 16]
10294pmulhrsw m3, m7
10295
10296pmaddubsw m4, m2, [r5 + 20 * 16]
10297pmulhrsw m4, m7
10298
10299packuswb m5, m3, m4
10300movu [r0 + 608], m5
10301
10302; mode 12 [row 0, 1]
10303
10304pmaddubsw m4, m2, [r5 + 27 * 16]
10305pmulhrsw m4, m7
10306
10307packuswb m4, m3
10308movu [r0 + 640], m4
10309
10310; mode 11 [row 6, 7]
10311
10312pmaddubsw m3, m2, [r5 + 18 * 16]
10313pmulhrsw m3, m7
10314
10315pmaddubsw m4, m2, [r5 + 16 * 16]
10316pmulhrsw m4, m7
10317
10318packuswb m3, m4
10319movu [r0 + 624], m3
10320
10321; mode 12 [row 2, 3]
10322
10323pmaddubsw m3, m2, [r5 + 17 * 16]
10324pmulhrsw m3, m7
10325
10326pmaddubsw m4, m2, [r5 + 12 * 16]
10327pmulhrsw m4, m7
10328
10329packuswb m3, m4
10330movu [r0 + 656], m3
10331
10332; mode 12 [row 4, 5]
10333
10334pmaddubsw m3, m2, [r5 + 7 * 16]
10335pmulhrsw m3, m7
10336
10337pmaddubsw m4, m2, [r5 + 2 * 16]
10338pmulhrsw m4, m7
10339
10340packuswb m3, m4
10341movu [r0 + 672], m3
10342
10343; mode 12 [row 6, 7]
10344
10345pslldq m3, m2, 2
10346pinsrb m3, [r1 + 0], 1
10347pinsrb m3, [r1 + 6], 0
10348
10349pmaddubsw m4, m3, [r5 + 29 * 16]
10350pmulhrsw m4, m7
10351
10352pmaddubsw m5, m3, [r5 + 24 * 16]
10353pmulhrsw m5, m7
10354
10355packuswb m4, m5
10356movu [r0 + 688], m4
10357
10358; mode 13 [row 0, 1]
10359
10360pmaddubsw m4, m2, [r5 + 23 * 16]
10361pmulhrsw m4, m7
10362
10363pmaddubsw m5, m2, [r5 + 14 * 16]
10364pmulhrsw m5, m7
10365
10366packuswb m4, m5
10367movu [r0 + 704], m4
10368
10369; mode 13 [row 2, 3]
10370
10371pmaddubsw m4, m2, [r5 + 5 * 16]
10372pmulhrsw m4, m7
10373
10374pinsrb m3, [r1 + 4], 0
10375pmaddubsw m5, m3, [r5 + 28 * 16]
10376pmulhrsw m5, m7
10377
10378packuswb m4, m5
10379movu [r0 + 720], m4
10380
10381; mode 13 [row 4, 5]
10382
10383pmaddubsw m4, m3, [r5 + 19 * 16]
10384pmulhrsw m4, m7
10385
10386pmaddubsw m5, m3, [r5 + 10 * 16]
10387pmulhrsw m5, m7
10388
10389packuswb m4, m5
10390movu [r0 + 736], m4
10391
10392; mode 13 [row 6, 7]
10393
10394pmaddubsw m4, m3, [r5 + 1 * 16]
10395pmulhrsw m4, m7
10396
10397pslldq m5, m3, 2
10398pinsrb m5, [r1 + 4], 1
10399pinsrb m5, [r1 + 7], 0
10400
10401pmaddubsw m5, [r5 + 24 * 16]
10402pmulhrsw m5, m7
10403
10404packuswb m4, m5
10405movu [r0 + 752], m4
10406
10407; mode 14 [row 0, 1]
10408
10409pmaddubsw m4, m2, [r5 + 19 * 16]
10410pmulhrsw m4, m7
10411
10412pmaddubsw m5, m2, [r5 + 6 * 16]
10413pmulhrsw m5, m7
10414
10415packuswb m4, m5
10416movu [r0 + 768], m4
10417
10418; mode 14 [row 2, 3]
10419
10420pinsrb m3, [r1 + 2], 0
10421
10422pmaddubsw m4, m3, [r5 + 25 * 16]
10423pmulhrsw m4, m7
10424
10425pmaddubsw m5, m3, [r5 + 12 * 16]
10426pmulhrsw m5, m7
10427
10428packuswb m4, m5
10429movu [r0 + 784], m4
10430
10431; mode 14 [row 4, 5]
10432
10433pslldq m1, m3, 2
10434pinsrb m1, [r1 + 2], 1
10435pinsrb m1, [r1 + 5], 0
10436
10437pmaddubsw m4, m1, [r5 + 31 * 16]
10438pmulhrsw m4, m7
10439
10440pmaddubsw m5, m1, [r5 + 18 * 16]
10441pmulhrsw m5, m7
10442
10443packuswb m4, m5
10444movu [r0 + 800], m4
10445
10446; mode 14 [row 6, 7]
10447
10448pmaddubsw m4, m1, [r5 + 5 * 16]
10449pmulhrsw m4, m7
10450
10451pslldq m1, 2
10452pinsrb m1, [r1 + 5], 1
10453pinsrb m1, [r1 + 7], 0
10454
10455pmaddubsw m5, m1, [r5 + 24 * 16]
10456pmulhrsw m5, m7
10457
10458packuswb m4, m5
10459movu [r0 + 816], m4
10460
10461; mode 15 [row 0, 1]
10462
10463pmaddubsw m4, m2, [r5 + 15 * 16]
10464pmulhrsw m4, m7
10465
10466pmaddubsw m5, m3, [r5 + 30 * 16]
10467pmulhrsw m5, m7
10468
10469packuswb m4, m5
10470movu [r0 + 832], m4
10471
10472; mode 15 [row 2, 3]
10473
10474pmaddubsw m4, m3, [r5 + 13 * 16]
10475pmulhrsw m4, m7
10476
10477pslldq m1, m3, 2
10478pinsrb m1, [r1 + 2], 1
10479pinsrb m1, [r1 + 4], 0
10480
10481pmaddubsw m5, m1, [r5 + 28 * 16]
10482pmulhrsw m5, m7
10483
10484packuswb m4, m5
10485movu [r0 + 848], m4
10486
10487; mode 15 [row 4, 5]
10488
10489pmaddubsw m4, m1, [r5 + 11 * 16]
10490pmulhrsw m4, m7
10491
10492pslldq m1, 2
10493pinsrb m1, [r1 + 4], 1
10494pinsrb m1, [r1 + 6], 0
10495
10496pmaddubsw m5, m1, [r5 + 26 * 16]
10497pmulhrsw m5, m7
10498
10499packuswb m4, m5
10500movu [r0 + 864], m4
10501
10502; mode 15 [row 6, 7]
10503
10504pmaddubsw m4, m1, [r5 + 9 * 16]
10505pmulhrsw m4, m7
10506
10507pslldq m1, 2
10508pinsrb m1, [r1 + 6], 1
10509pinsrb m1, [r1 + 8], 0
10510
10511pmaddubsw m1, [r5 + 24 * 16]
10512pmulhrsw m1, m7
10513
10514packuswb m4, m1
10515movu [r0 + 880], m4
10516
10517; mode 16 [row 0, 1]
10518
10519pmaddubsw m4, m2, [r5 + 11 * 16]
10520pmulhrsw m4, m7
10521
10522pmaddubsw m5, m3, [r5 + 22 * 16]
10523pmulhrsw m5, m7
10524
10525packuswb m4, m5
10526movu [r0 + 896], m4
10527
10528; mode 16 [row 2, 3]
10529
10530pmaddubsw m4, m3, [r5 + 1 * 16]
10531pmulhrsw m4, m7
10532
10533pslldq m3, 2
10534pinsrb m3, [r1 + 2], 1
10535pinsrb m3, [r1 + 3], 0
10536
10537pmaddubsw m5, m3, [r5 + 12 * 16]
10538pmulhrsw m5, m7
10539
10540packuswb m4, m5
10541movu [r0 + 912], m4
10542
10543; mode 16 [row 4, 5]
10544
10545pslldq m3, 2
10546pinsrb m3, [r1 + 3], 1
10547pinsrb m3, [r1 + 5], 0
10548
10549pmaddubsw m4, m3, [r5 + 23 * 16]
10550pmulhrsw m4, m7
10551
10552pmaddubsw m5, m3, [r5 + 2 * 16]
10553pmulhrsw m5, m7
10554
10555packuswb m4, m5
10556movu [r0 + 928], m4
10557
10558; mode 16 [row 6, 7]
10559
10560pslldq m3, 2
10561pinsrb m3, [r1 + 5], 1
10562pinsrb m3, [r1 + 6], 0
10563
10564pmaddubsw m4, m3, [r5 + 13 * 16]
10565pmulhrsw m4, m7
10566
10567pslldq m3, 2
10568pinsrb m3, [r1 + 6], 1
10569pinsrb m3, [r1 + 8], 0
10570
10571pmaddubsw m3, [r5 + 24 * 16]
10572pmulhrsw m3, m7
10573
10574packuswb m4, m3
10575movu [r0 + 944], m4
10576
10577; mode 17 [row 0, 1]
10578
10579pmaddubsw m4, m2, [r5 + 6 * 16]
10580pmulhrsw m4, m7
10581
10582pslldq m2, 2
10583pinsrb m2, [r1 + 0], 1
10584pinsrb m2, [r1 + 1], 0
10585
10586pmaddubsw m3, m2, [r5 + 12 * 16]
10587pmulhrsw m3, m7
10588
10589packuswb m4, m3
10590movu [r0 + 960], m4
10591
10592; mode 17 [row 2, 3]
10593
10594pslldq m2, 2
10595pinsrb m2, [r1 + 1], 1
10596pinsrb m2, [r1 + 2], 0
10597
10598pmaddubsw m4, m2, [r5 + 18 * 16]
10599pmulhrsw m4, m7
10600
10601pslldq m2, 2
10602pinsrb m2, [r1 + 2], 1
10603pinsrb m2, [r1 + 4], 0
10604
10605pmaddubsw m3, m2, [r5 + 24 * 16]
10606pmulhrsw m3, m7
10607
10608packuswb m4, m3
10609movu [r0 + 976], m4
10610
10611; mode 17 [row 4, 5]
10612
10613pslldq m2, 2
10614pinsrb m2, [r1 + 4], 1
10615pinsrb m2, [r1 + 5], 0
10616
10617pmaddubsw m4, m2, [r5 + 30 * 16]
10618pmulhrsw m4, m7
10619
10620pmaddubsw m3, m2, [r5 + 4 * 16]
10621pmulhrsw m3, m7
10622
10623packuswb m4, m3
10624movu [r0 + 992], m4
10625
10626; mode 17 [row 6, 7]
10627
10628pslldq m2, 2
10629pinsrb m2, [r1 + 5], 1
10630pinsrb m2, [r1 + 6], 0
10631
10632pmaddubsw m4, m2, [r5 + 10 * 16]
10633pmulhrsw m4, m7
10634
10635pslldq m2, 2
10636pinsrb m2, [r1 + 6], 1
10637pinsrb m2, [r1 + 7], 0
10638
10639pmaddubsw m3, m2, [r5 + 16 * 16]
10640pmulhrsw m3, m7
10641
10642packuswb m4, m3
10643movu [r0 + 1008], m4
10644
10645; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
10646
10647movh m1, [r3]
10648movh [r0 + 1024], m1
10649
10650pslldq m2, m1, 1
10651pinsrb m2, [r4 + 1], 0
10652movh [r0 + 1032], m2
10653
10654pslldq m2, 1
10655pinsrb m2, [r4 + 2], 0
10656movh [r0 + 1040], m2
10657
10658pslldq m2, 1
10659pinsrb m2, [r4 + 3], 0
10660movh [r0 + 1048], m2
10661
10662pslldq m2, 1
10663pinsrb m2, [r4 + 4], 0
10664movh [r0 + 1056], m2
10665
10666pslldq m2, 1
10667pinsrb m2, [r4 + 5], 0
10668movh [r0 + 1064], m2
10669
10670pslldq m2, 1
10671pinsrb m2, [r4 + 6], 0
10672movh [r0 + 1072], m2
10673
10674pslldq m2, 1
10675pinsrb m2, [r4 + 7], 0
10676movh [r0 + 1080], m2
10677
10678; mode 19 [row 0, 1]
10679
10680movu m0, [r1]
10681palignr m1, m0, 1
10682punpcklbw m0, m1
10683
10684pmaddubsw m1, m0, [r5 + 6 * 16]
10685pmulhrsw m1, m7
10686
10687pslldq m2, m0, 2
10688pinsrb m2, [r2 + 0], 1
10689pinsrb m2, [r2 + 1], 0
10690
10691pmaddubsw m3, m2, [r5 + 12 * 16]
10692pmulhrsw m3, m7
10693
10694packuswb m1, m3
10695movu [r0 + 1088], m1
10696
10697; mode 19 [row 2, 3]
10698
10699pslldq m2, 2
10700pinsrb m2, [r2 + 1], 1
10701pinsrb m2, [r2 + 2], 0
10702
10703pmaddubsw m4, m2, [r5 + 18 * 16]
10704pmulhrsw m4, m7
10705
10706pslldq m2, 2
10707pinsrb m2, [r2 + 2], 1
10708pinsrb m2, [r2 + 4], 0
10709
10710pmaddubsw m5, m2, [r5 + 24 * 16]
10711pmulhrsw m5, m7
10712
10713packuswb m4, m5
10714movu [r0 + 1104], m4
10715
10716; mode 19 [row 4, 5]
10717
10718pslldq m2, 2
10719pinsrb m2, [r2 + 4], 1
10720pinsrb m2, [r2 + 5], 0
10721
10722pmaddubsw m4, m2, [r5 + 30 * 16]
10723pmulhrsw m4, m7
10724
10725pmaddubsw m5, m2, [r5 + 4 * 16]
10726pmulhrsw m5, m7
10727
10728packuswb m4, m5
10729movu [r0 + 1120], m4
10730
10731; mode 19 [row 6, 7]
10732
10733pslldq m2, 2
10734pinsrb m2, [r2 + 5], 1
10735pinsrb m2, [r2 + 6], 0
10736
10737pmaddubsw m4, m2, [r5 + 10 * 16]
10738pmulhrsw m4, m7
10739
10740pslldq m2, 2
10741pinsrb m2, [r2 + 6], 1
10742pinsrb m2, [r2 + 7], 0
10743
10744pmaddubsw m2, [r5 + 16 * 16]
10745pmulhrsw m2, m7
10746
10747packuswb m4, m2
10748movu [r0 + 1136], m4
10749
10750; mode 20 [row 0, 1]
10751
10752pmaddubsw m3, m0, [r5 + 11 * 16]
10753pmulhrsw m3, m7
10754
10755pslldq m1, m0, 2
10756pinsrb m1, [r2 + 0], 1
10757pinsrb m1, [r2 + 2], 0
10758
10759pmaddubsw m4, m1, [r5 + 22 * 16]
10760pmulhrsw m4, m7
10761
10762packuswb m3, m4
10763movu [r0 + 1152], m3
10764
10765; mode 20 [row 2, 3]
10766
10767pmaddubsw m3, m1, [r5 + 1 * 16]
10768pmulhrsw m3, m7
10769
10770pslldq m2, m1, 2
10771pinsrb m2, [r2 + 2], 1
10772pinsrb m2, [r2 + 3], 0
10773
10774pmaddubsw m4, m2, [r5 + 12 * 16]
10775pmulhrsw m4, m7
10776
10777packuswb m3, m4
10778movu [r0 + 1168], m3
10779
10780; mode 20 [row 4, 5]
10781
10782pslldq m2, 2
10783pinsrb m2, [r2 + 3], 1
10784pinsrb m2, [r2 + 5], 0
10785
10786pmaddubsw m3, m2, [r5 + 23 * 16]
10787pmulhrsw m3, m7
10788
10789pmaddubsw m4, m2, [r5 + 2 * 16]
10790pmulhrsw m4, m7
10791
10792packuswb m3, m4
10793movu [r0 + 1184], m3
10794
10795; mode 20 [row 6, 7]
10796
10797pslldq m2, 2
10798pinsrb m2, [r2 + 5], 1
10799pinsrb m2, [r2 + 6], 0
10800
10801pmaddubsw m3, m2, [r5 + 13 * 16]
10802pmulhrsw m3, m7
10803
10804pslldq m2, 2
10805pinsrb m2, [r2 + 6], 1
10806pinsrb m2, [r2 + 8], 0
10807
10808pmaddubsw m4, m2, [r5 + 24 * 16]
10809pmulhrsw m4, m7
10810
10811packuswb m3, m4
10812movu [r0 + 1200], m3
10813
10814; mode 21 [row 0, 1]
10815
10816pmaddubsw m2, m0, [r5 + 15 * 16]
10817pmulhrsw m2, m7
10818
10819pmaddubsw m3, m1, [r5 + 30 * 16]
10820pmulhrsw m3, m7
10821
10822packuswb m2, m3
10823movu [r0 + 1216], m2
10824
10825; mode 21 [row 2, 3]
10826
10827pmaddubsw m2, m1, [r5 + 13 * 16]
10828pmulhrsw m2, m7
10829
10830pslldq m3, m1, 2
10831pinsrb m3, [r2 + 2], 1
10832pinsrb m3, [r2 + 4], 0
10833
10834pmaddubsw m4, m3, [r5 + 28 * 16]
10835pmulhrsw m4, m7
10836
10837packuswb m2, m4
10838movu [r0 + 1232], m2
10839
10840; mode 21 [row 4, 5]
10841
10842pmaddubsw m2, m3, [r5 + 11 * 16]
10843pmulhrsw m2, m7
10844
10845pslldq m3, 2
10846pinsrb m3, [r2 + 4], 1
10847pinsrb m3, [r2 + 6], 0
10848
10849pmaddubsw m4, m3, [r5 + 26 * 16]
10850pmulhrsw m4, m7
10851
10852packuswb m2, m4
10853movu [r0 + 1248], m2
10854
10855; mode 21 [row 6, 7]
10856
10857pmaddubsw m2, m3, [r5 + 9 * 16]
10858pmulhrsw m2, m7
10859
10860pslldq m3, 2
10861pinsrb m3, [r2 + 6], 1
10862pinsrb m3, [r2 + 8], 0
10863
10864pmaddubsw m4, m3, [r5 + 24 * 16]
10865pmulhrsw m4, m7
10866
10867packuswb m2, m4
10868movu [r0 + 1264], m2
10869
10870; mode 22 [row 0, 1]
10871
10872pmaddubsw m2, m0, [r5 + 19 * 16]
10873pmulhrsw m2, m7
10874
10875pmaddubsw m4, m0, [r5 + 6 * 16]
10876pmulhrsw m4, m7
10877
10878packuswb m2, m4
10879movu [r0 + 1280], m2
10880
10881; mode 22 [row 2, 3]
10882
10883pmaddubsw m2, m1, [r5 + 25 * 16]
10884pmulhrsw m2, m7
10885
10886pmaddubsw m3, m1, [r5 + 12 * 16]
10887pmulhrsw m3, m7
10888
10889packuswb m2, m3
10890movu [r0 + 1296], m2
10891
10892; mode 22 [row 4, 5]
10893
10894pslldq m1, 2
10895pinsrb m1, [r2 + 5], 0
10896pinsrb m1, [r2 + 2], 1
10897
10898pmaddubsw m2, m1, [r5 + 31 * 16]
10899pmulhrsw m2, m7
10900
10901pmaddubsw m3, m1, [r5 + 18 * 16]
10902pmulhrsw m3, m7
10903
10904packuswb m2, m3
10905movu [r0 + 1312], m2
10906
10907; mode 22 [row 6, 7]
10908
10909pmaddubsw m2, m1, [r5 + 5 * 16]
10910pmulhrsw m2, m7
10911
10912pslldq m1, 2
10913pinsrb m1, [r2 + 5], 1
10914pinsrb m1, [r2 + 7], 0
10915
10916pmaddubsw m1, [r5 + 24 * 16]
10917pmulhrsw m1, m7
10918
10919packuswb m2, m1
10920movu [r0 + 1328], m2
10921
10922; mode 23 [row 0, 1]
10923
10924pmaddubsw m2, m0, [r5 + 23 * 16]
10925pmulhrsw m2, m7
10926
10927pmaddubsw m3, m0, [r5 + 14 * 16]
10928pmulhrsw m3, m7
10929
10930packuswb m2, m3
10931movu [r0 + 1344], m2
10932
10933; mode 23 [row 2, 3]
10934
10935pmaddubsw m2, m0, [r5 + 5 * 16]
10936pmulhrsw m2, m7
10937
10938pslldq m1, m0, 2
10939pinsrb m1, [r2 + 0], 1
10940pinsrb m1, [r2 + 4], 0
10941
10942pmaddubsw m3, m1, [r5 + 28 * 16]
10943pmulhrsw m3, m7
10944
10945packuswb m2, m3
10946movu [r0 + 1360], m2
10947
10948; mode 23 [row 4, 5]
10949
10950pmaddubsw m2, m1, [r5 + 19 * 16]
10951pmulhrsw m2, m7
10952
10953pmaddubsw m3, m1, [r5 + 10 * 16]
10954pmulhrsw m3, m7
10955
10956packuswb m2, m3
10957movu [r0 + 1376], m2
10958
10959; mode 23 [row 6, 7]
10960
10961pmaddubsw m2, m1, [r5 + 1 * 16]
10962pmulhrsw m2, m7
10963
10964pslldq m3, m1, 2
10965pinsrb m3, [r2 + 4], 1
10966pinsrb m3, [r2 + 7], 0
10967
10968pmaddubsw m3, [r5 + 24 * 16]
10969pmulhrsw m3, m7
10970
10971packuswb m2, m3
10972movu [r0 + 1392], m2
10973
10974; mode 24 [row 0, 1]
10975
10976pmaddubsw m2, m0, [r5 + 27 * 16]
10977pmulhrsw m2, m7
10978
10979pmaddubsw m5, m0, [r5 + 22 * 16]
10980pmulhrsw m5, m7
10981
10982packuswb m2, m5
10983movu [r0 + 1408], m2
10984
10985; mode 24 [row 2, 3]
10986
10987pmaddubsw m2, m0, [r5 + 17 * 16]
10988pmulhrsw m2, m7
10989
10990pmaddubsw m3, m0, [r5 + 12 * 16]
10991pmulhrsw m3, m7
10992
10993packuswb m2, m3
10994movu [r0 + 1424], m2
10995
10996; mode 24 [row 4, 5]
10997
10998pmaddubsw m2, m0, [r5 + 7 * 16]
10999pmulhrsw m2, m7
11000
11001pmaddubsw m3, m0, [r5 + 2 * 16]
11002pmulhrsw m3, m7
11003
11004packuswb m2, m3
11005movu [r0 + 1440], m2
11006
11007; mode 24 [row 6, 7]
11008
11009pinsrb m1, [r2 + 6], 0
11010
11011pmaddubsw m2, m1, [r5 + 29 * 16]
11012pmulhrsw m2, m7
11013
11014pmaddubsw m1, [r5 + 24 * 16]
11015pmulhrsw m1, m7
11016
11017packuswb m2, m1
11018movu [r0 + 1456], m2
11019
11020; mode 25 [row 0, 1]
11021
11022pmaddubsw m2, m0, [r5 + 30 * 16]
11023pmulhrsw m2, m7
11024
11025pmaddubsw m1, m0, [r5 + 28 * 16]
11026pmulhrsw m1, m7
11027
11028packuswb m2, m1
11029movu [r0 + 1472], m2
11030
11031; mode 25 [row 2, 3]
11032
11033pmaddubsw m2, m0, [r5 + 26 * 16]
11034pmulhrsw m2, m7
11035
11036pmaddubsw m1, m0, [r5 + 24 * 16]
11037pmulhrsw m1, m7
11038
11039packuswb m2, m1
11040movu [r0 + 1488], m2
11041
11042; mode 25 [row 4, 5]
11043
11044pmaddubsw m1, m0, [r5 + 20 * 16]
11045pmulhrsw m1, m7
11046
11047packuswb m5, m1
11048movu [r0 + 1504], m5
11049
11050; mode 25 [row 6, 7]
11051
11052pmaddubsw m2, m0, [r5 + 18 * 16]
11053pmulhrsw m2, m7
11054
11055pmaddubsw m1, m0, [r5 + 16 * 16]
11056pmulhrsw m1, m7
11057
11058packuswb m2, m1
11059movu [r0 + 1520], m2
11060
11061; mode 26
11062
11063movu m0, [r1 + 1]
11064
11065pshufb m1, m0, [tab_Si]
11066movu [r0 + 1536], m1
11067movu [r0 + 1552], m1
11068movu [r0 + 1568], m1
11069movu [r0 + 1584], m1
11070
11071pxor m5, m5
11072
11073pshufb m1, m1, m5
11074punpcklbw m1, m5
11075
11076movu m2, [r2]
11077
11078pshufb m3, m2, m5
11079punpcklbw m3, m5
11080
11081psrldq m4, m2, 1
11082punpcklbw m4, m5
11083
11084movu m2, [r2 + 9]
11085punpcklbw m2, m5
11086
11087psubw m4, m3
11088psubw m2, m3
11089
11090psraw m4, 1
11091psraw m2, 1
11092
11093paddw m4, m1
11094paddw m2, m1
11095
11096packuswb m4, m2
11097
11098pextrb [r0 + 1536], m4, 0
11099pextrb [r0 + 1544], m4, 1
11100pextrb [r0 + 1552], m4, 2
11101pextrb [r0 + 1560], m4, 3
11102pextrb [r0 + 1568], m4, 4
11103pextrb [r0 + 1576], m4, 5
11104pextrb [r0 + 1584], m4, 6
11105pextrb [r0 + 1592], m4, 7
11106
11107; mode 27 [row 0, 1]
11108
11109palignr m6, m0, 1
11110punpcklbw m4, m0, m6
11111
11112pmaddubsw m1, m4, [r5 + 2 * 16]
11113pmulhrsw m1, m7
11114
11115pmaddubsw m2, m4, [r5 + 4 * 16]
11116pmulhrsw m2, m7
11117
11118packuswb m1, m2
11119movu [r0 + 1600], m1
11120
11121; mode 27 [row 2, 3]
11122
11123pmaddubsw m1, m4, [r5 + 6 * 16]
11124pmulhrsw m1, m7
11125
11126pmaddubsw m2, m4, [r5 + 8 * 16]
11127pmulhrsw m2, m7
11128
11129packuswb m1, m2
11130movu [r0 + 1616], m1
11131
11132; mode 27 [row 4, 5]
11133
11134pmaddubsw m3, m4, [r5 + 10 * 16]
11135pmulhrsw m3, m7
11136
11137pmaddubsw m2, m4, [r5 + 12 * 16]
11138pmulhrsw m2, m7
11139
11140packuswb m1, m3, m2
11141movu [r0 + 1632], m1
11142
11143; mode 27 [row 6, 7]
11144
11145pmaddubsw m1, m4, [r5 + 14 * 16]
11146pmulhrsw m1, m7
11147
11148pmaddubsw m2, m4, [r5 + 16 * 16]
11149pmulhrsw m2, m7
11150
11151packuswb m1, m2
11152movu [r0 + 1648], m1
11153
11154; mode 28 [row 0, 1]
11155
11156pmaddubsw m1, m4, [r5 + 5 * 16]
11157pmulhrsw m1, m7
11158
11159packuswb m1, m3
11160movu [r0 + 1664], m1
11161
11162; mode 28 [row 2, 3]
11163
11164pmaddubsw m1, m4, [r5 + 15 * 16]
11165pmulhrsw m1, m7
11166
11167pmaddubsw m2, m4, [r5 + 20 * 16]
11168pmulhrsw m2, m7
11169
11170packuswb m1, m2
11171movu [r0 + 1680], m1
11172
11173; mode 28 [row 4, 5]
11174
11175pmaddubsw m1, m4, [r5 + 25 * 16]
11176pmulhrsw m1, m7
11177
11178pmaddubsw m2, m4, [r5 + 30 * 16]
11179pmulhrsw m2, m7
11180
11181packuswb m1, m2
11182movu [r0 + 1696], m1
11183
11184; mode 28 [row 6, 7]
11185
11186palignr m1, m0, 2
11187punpcklbw m5, m6, m1
11188
11189pmaddubsw m2, m5, [r5 + 3 * 16]
11190pmulhrsw m2, m7
11191
11192pmaddubsw m3, m5, [r5 + 8 * 16]
11193pmulhrsw m3, m7
11194
11195packuswb m2, m3
11196movu [r0 + 1712], m2
11197
11198; mode 29 [row 0, 1]
11199
11200pmaddubsw m2, m4, [r5 + 9 * 16]
11201pmulhrsw m2, m7
11202
11203pmaddubsw m3, m4, [r5 + 18 * 16]
11204pmulhrsw m3, m7
11205
11206packuswb m2, m3
11207movu [r0 + 1728], m2
11208
11209; mode 29 [row 2, 3]
11210
11211pmaddubsw m2, m4, [r5 + 27 * 16]
11212pmulhrsw m2, m7
11213
11214pmaddubsw m3, m5, [r5 + 4 * 16]
11215pmulhrsw m3, m7
11216
11217packuswb m2, m3
11218movu [r0 + 1744], m2
11219
11220; mode 29 [row 4, 5]
11221
11222pmaddubsw m2, m5, [r5 + 13 * 16]
11223pmulhrsw m2, m7
11224
11225pmaddubsw m3, m5, [r5 + 22 * 16]
11226pmulhrsw m3, m7
11227
11228packuswb m2, m3
11229movu [r0 + 1760], m2
11230
11231; mode 29 [row 6, 7]
11232
11233pmaddubsw m2, m5, [r5 + 31 * 16]
11234pmulhrsw m2, m7
11235
11236palignr m6, m0, 3
11237punpcklbw m1, m6
11238
11239pmaddubsw m3, m1, [r5 + 8 * 16]
11240pmulhrsw m3, m7
11241
11242packuswb m2, m3
11243movu [r0 + 1776], m2
11244
11245; mode 32 [row 2]
11246
11247movh [r0 + 1936], m2
11248
11249; mode 30 [row 0, 1]
11250
11251pmaddubsw m2, m4, [r5 + 13 * 16]
11252pmulhrsw m2, m7
11253
11254pmaddubsw m3, m4, [r5 + 26 * 16]
11255pmulhrsw m3, m7
11256
11257packuswb m2, m3
11258movu [r0 + 1792], m2
11259
11260; mode 30 [row 2, 3]
11261
11262pmaddubsw m2, m5, [r5 + 7 * 16]
11263pmulhrsw m2, m7
11264
11265pmaddubsw m3, m5, [r5 + 20 * 16]
11266pmulhrsw m3, m7
11267
11268packuswb m2, m3
11269movu [r0 + 1808], m2
11270
11271; mode 33 [row 1]
11272
11273movhps [r0 + 1992], m2
11274
11275; mode 30 [row 4, 5]
11276
11277pmaddubsw m2, m1, [r5 + 1 * 16]
11278pmulhrsw m2, m7
11279
11280pmaddubsw m3, m1, [r5 + 14 * 16]
11281pmulhrsw m3, m7
11282
11283packuswb m2, m3
11284movu [r0 + 1824], m2
11285
11286; mode 33 [row 2]
11287
11288movhps [r0 + 2000], m2
11289
11290; mode 30 [row 6, 7]
11291
11292pmaddubsw m2, m1, [r5 + 27 * 16]
11293pmulhrsw m2, m7
11294
11295psrldq m0, 4
11296punpcklbw m6, m0
11297
11298pmaddubsw m3, m6, [r5 + 8 * 16]
11299pmulhrsw m3, m7
11300
11301packuswb m2, m3
11302movu [r0 + 1840], m2
11303
11304; mode 33 [row 3]
11305
11306movhps [r0 + 2008], m2
11307
11308; mode 31 [row 0, 1]
11309
11310pmaddubsw m2, m4, [r5 + 17 * 16]
11311pmulhrsw m2, m7
11312
11313pmaddubsw m3, m5, [r5 + 2 * 16]
11314pmulhrsw m3, m7
11315
11316packuswb m2, m3
11317movu [r0 + 1856], m2
11318
11319; mode 31 [row 2, 3]
11320
11321pmaddubsw m2, m5, [r5 + 19 * 16]
11322pmulhrsw m2, m7
11323
11324pmaddubsw m3, m1, [r5 + 4 * 16]
11325pmulhrsw m3, m7
11326
11327packuswb m2, m3
11328movu [r0 + 1872], m2
11329
11330; mode 31 [row 4, 5]
11331
11332pmaddubsw m2, m1, [r5 + 21 * 16]
11333pmulhrsw m2, m7
11334
11335pmaddubsw m3, m6, [r5 + 6 * 16]
11336pmulhrsw m3, m7
11337
11338packuswb m2, m3
11339movu [r0 + 1888], m2
11340
11341; mode 31 [row 6, 7]
11342
11343pmaddubsw m2, m6, [r5 + 23 * 16]
11344pmulhrsw m2, m7
11345
11346movu m3, [r1 + 6]
11347punpcklbw m0, m3
11348
11349pmaddubsw m3, m0, [r5 + 8 * 16]
11350pmulhrsw m3, m7
11351
11352packuswb m2, m3
11353movu [r0 + 1904], m2
11354
11355; mode 32 [row 0, 1]
11356
11357pmaddubsw m2, m4, [r5 + 21 * 16]
11358pmulhrsw m2, m7
11359
11360pmaddubsw m3, m5, [r5 + 10 * 16]
11361pmulhrsw m3, m7
11362
11363packuswb m2, m3
11364movu [r0 + 1920], m2
11365
11366; mode 32 [row 3]
11367
11368pmaddubsw m2, m1, [r5 + 20 * 16]
11369pmulhrsw m2, m7
11370
11371pxor m3, m3
11372
11373packuswb m2, m3
11374movh [r0 + 1944], m2
11375
11376; mode 32 [row 4, 5]
11377
11378pmaddubsw m2, m6, [r5 + 9 * 16]
11379pmulhrsw m2, m7
11380
11381pmaddubsw m3, m6, [r5 + 30 * 16]
11382pmulhrsw m3, m7
11383
11384packuswb m2, m3
11385movu [r0 + 1952], m2
11386
11387; mode 33 [row 4, 5]
11388
11389pmaddubsw m2, m0, [r5 + 2 * 16]
11390pmulhrsw m2, m7
11391
11392pmaddubsw m3, m0, [r5 + 28 * 16]
11393pmulhrsw m3, m7
11394
11395packuswb m2, m3
11396movu [r0 + 2016], m2
11397
11398; mode 32 [row 6]
11399
11400pmaddubsw m2, m0, [r5 + 19 * 16]
11401pmulhrsw m2, m7
11402
11403; mode 32 [row 7]
11404
11405movu m0, [r1 + 6]
11406palignr m3, m0, 1
11407punpcklbw m0, m3
11408
11409pmaddubsw m3, m0, [r5 + 8 * 16]
11410pmulhrsw m3, m7
11411
11412packuswb m2, m3
11413movu [r0 + 1968], m2
11414
11415; mode 33 [row 6, 7]
11416
11417pmaddubsw m2, m0, [r5 + 22 * 16]
11418pmulhrsw m2, m7
11419
11420movu m0, [r1 + 7]
11421palignr m3, m0, 1
11422punpcklbw m0, m3
11423
11424pmaddubsw m3, m0, [r5 + 16 * 16]
11425pmulhrsw m3, m7
11426
11427packuswb m2, m3
11428movu [r0 + 2032], m2
11429
11430; mode 33 [row 0]
11431
11432pmaddubsw m2, m4, [r5 + 26 * 16]
11433pmulhrsw m2, m7
11434
11435pxor m3, m3
11436
11437packuswb m2, m3
11438movh [r0 + 1984], m2
11439
11440; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
11441
11442movu m0, [r3 + 2]
11443palignr m1, m0, 1
11444punpcklqdq m2, m0, m1
11445movu [r0 + 2048], m2
11446
11447palignr m1, m0, 2
11448palignr m2, m0, 3
11449punpcklqdq m1, m2
11450movu [r0 + 2064], m1
11451
11452palignr m1, m0, 4
11453palignr m2, m0, 5
11454punpcklqdq m1, m2
11455movu [r0 + 2080], m1
11456
11457palignr m1, m0, 6
11458palignr m2, m0, 7
11459punpcklqdq m1, m2
11460movu [r0 + 2096], m1
11461
11462RET
11463
11464;-----------------------------------------------------------------------------
11465; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
11466;-----------------------------------------------------------------------------
11467INIT_XMM sse4
11468cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
11469
11470movu m0, [r4 + 2]
11471movu [r0 + 0 * 16], m0
11472
11473movu m1, m0
11474
11475movu m6, [r4 + 18]
11476palignr m5, m6, m0, 1
11477movu [r0 + 1 * 16], m5
11478
11479movu m4, m5
11480
11481palignr m5, m6, m0, 2
11482movu [r0 + 2 * 16], m5
11483palignr m5, m6, m0, 3
11484movu [r0 + 3 * 16], m5
11485palignr m5, m6, m0, 4
11486movu [r0 + 4 * 16], m5
11487palignr m5, m6, m0, 5
11488movu [r0 + 5 * 16], m5
11489palignr m5, m6, m0, 6
11490movu [r0 + 6 * 16], m5
11491palignr m5, m6, m0, 7
11492movu [r0 + 7 * 16], m5
11493
11494movu m7, m5
11495
11496palignr m5, m6, m0, 8
11497movu [r0 + 8 * 16], m5
11498
11499movu m2, m5
11500
11501palignr m5, m6, m0, 9
11502movu [r0 + 9 * 16], m5
11503
11504palignr m3, m6, m0, 10
11505movu [r0 + 10 * 16], m3
11506palignr m3, m6, m0, 11
11507movu [r0 + 11 * 16], m3
11508palignr m3, m6, m0, 12
11509movu [r0 + 12 * 16], m3
11510
11511; mode 3 [row 15]
11512movu [r0 + (3-2)*16*16 + 15 * 16], m3
11513
11514palignr m3, m6, m0, 13
11515movu [r0 + 13 * 16], m3
11516palignr m3, m6, m0, 14
11517movu [r0 + 14 * 16], m3
11518palignr m3, m6, m0, 15
11519movu [r0 + 15 * 16], m3
11520
11521; mode 3 [row 0]
11522lea r5, [ang_table]
11523movu m3, [pw_1024]
11524movu m0, [r4 + 1]
11525punpcklbw m0, m1
11526
11527; mode 17 [row 8 - second half]
11528pmaddubsw m1, m0, [r5 + 22 * 16]
11529pmulhrsw m1, m3
11530packuswb m1, m1
11531movh [r0 + 248 * 16 + 8], m1
11532; mode 17 [row 8 - second half] end
11533
11534pmaddubsw m1, m0, [r5 + 26 * 16]
11535pmulhrsw m1, m3
11536punpcklbw m7, m2
11537pmaddubsw m2, m7, [r5 + 26 * 16]
11538pmulhrsw m2, m3
11539packuswb m1, m2
11540movu [r0 + 16 * 16], m1
11541
11542;mode 6 [row 1]
11543movu [r0 + 65 * 16], m1
11544
11545; mode 4 [row 0]
11546pmaddubsw m1, m0, [r5 + 21 * 16]
11547pmulhrsw m1, m3
11548pmaddubsw m2, m7, [r5 + 21 * 16]
11549pmulhrsw m2, m3
11550packuswb m1, m2
11551movu [r0 + 32 * 16], m1
11552
11553; mode 5 [row 0]
11554pmaddubsw m1, m0, [r5 + 17 * 16]
11555pmulhrsw m1, m3
11556pmaddubsw m2, m7, [r5 + 17 * 16]
11557pmulhrsw m2, m3
11558packuswb m1, m2
11559movu [r0 + 48 * 16], m1
11560
11561; mode 6 [row 0]
11562pmaddubsw m1, m0, [r5 + 13 * 16]
11563pmulhrsw m1, m3
11564pmaddubsw m2, m7, [r5 + 13 * 16]
11565pmulhrsw m2, m3
11566packuswb m1, m2
11567movu [r0 + 64 * 16], m1
11568
11569; mode 7 [row 0]
11570pmaddubsw m1, m0, [r5 + 9 * 16]
11571pmulhrsw m1, m3
11572pmaddubsw m2, m7, [r5 + 9 * 16]
11573pmulhrsw m2, m3
11574packuswb m1, m2
11575movu [r0 + 80 * 16], m1
11576
11577; mode 7 [row 1]
11578pmaddubsw m1, m0, [r5 + 18 * 16]
11579pmulhrsw m1, m3
11580pmaddubsw m2, m7, [r5 + 18 * 16]
11581pmulhrsw m2, m3
11582packuswb m1, m2
11583movu [r0 + 81 * 16], m1
11584
11585; mode 7 [row 2]
11586pmaddubsw m1, m0, [r5 + 27 * 16]
11587pmulhrsw m1, m3
11588pmaddubsw m2, m7, [r5 + 27 * 16]
11589pmulhrsw m2, m3
11590packuswb m1, m2
11591movu [r0 + 82 * 16], m1
11592
11593; mode 8 [row 0]
11594pmaddubsw m1, m0, [r5 + 5 * 16]
11595pmulhrsw m1, m3
11596pmaddubsw m2, m7, [r5 + 5 * 16]
11597pmulhrsw m2, m3
11598packuswb m1, m2
11599movu [r0 + 96 * 16], m1
11600
11601; mode 8 [row 1]
11602pmaddubsw m1, m0, [r5 + 10 * 16]
11603pmulhrsw m1, m3
11604pmaddubsw m2, m7, [r5 + 10 * 16]
11605pmulhrsw m2, m3
11606packuswb m1, m2
11607movu [r0 + 97 * 16], m1
11608
11609; mode 8 [row 2]
11610pmaddubsw m1, m0, [r5 + 15 * 16]
11611pmulhrsw m1, m3
11612pmaddubsw m2, m7, [r5 + 15 * 16]
11613pmulhrsw m2, m3
11614packuswb m1, m2
11615movu [r0 + 98 * 16], m1
11616
11617; mode 8 [row 3]
11618pmaddubsw m1, m0, [r5 + 20 * 16]
11619pmulhrsw m1, m3
11620pmaddubsw m2, m7, [r5 + 20 * 16]
11621pmulhrsw m2, m3
11622packuswb m1, m2
11623movu [r0 + 99 * 16], m1
11624
11625; mode 8 [row 4]
11626pmaddubsw m1, m0, [r5 + 25 * 16]
11627pmulhrsw m1, m3
11628pmaddubsw m2, m7, [r5 + 25 * 16]
11629pmulhrsw m2, m3
11630packuswb m1, m2
11631movu [r0 + 100 * 16], m1
11632
11633; mode 8 [row 5]
11634pmaddubsw m1, m0, [r5 + 30 * 16]
11635pmulhrsw m1, m3
11636pmaddubsw m2, m7, [r5 + 30 * 16]
11637pmulhrsw m2, m3
11638packuswb m1, m2
11639movu [r0 + 101 * 16], m1
11640
11641; mode 15 [row 13 - second half]
11642pmaddubsw m1, m0, [r5 + 18 * 16]
11643pmulhrsw m1, m3
11644packuswb m1, m1
11645movh [r0 + 221 * 16 + 8], m1
11646; mode 15 [row 13 - second half] end
11647
11648; mode 15 [row 14 - second half]
11649pmaddubsw m1, m0, [r5 + 1 * 16]
11650pmulhrsw m1, m3
11651packuswb m1, m1
11652movh [r0 + 222 * 16 + 8], m1
11653; mode 15 [row 14 - second half] end
11654
11655; mode 16 [row 10 - second half]
11656pmaddubsw m1, m0, [r5 + 25 * 16]
11657pmulhrsw m1, m3
11658packuswb m1, m1
11659movh [r0 + 234 * 16 + 8], m1
11660; mode 16 [row 10 - second half] end
11661
11662; mode 16 [row 11 - second half]
11663pmaddubsw m1, m0, [r5 + 4 * 16]
11664pmulhrsw m1, m3
11665packuswb m1, m1
11666movh [r0 + 235 * 16 + 8], m1
11667; mode 16 [row 11 - second half] end
11668
11669; mode 3 [row 1]
11670movu m6, [r5 + 20 * 16]
11671movu m0, [r4 + 2]
11672punpcklbw m0, m4
11673
11674; mode 17 [row 7 - second half]
11675pmaddubsw m1, m0, [r5 + 16 * 16]
11676pmulhrsw m1, m3
11677packuswb m1, m1
11678movh [r0 + 247 * 16 + 8], m1
11679
11680; mode 17 [row 7 - second half] end
11681pmaddubsw m1, m0, m6
11682pmulhrsw m1, m3
11683movu m2, [r4 + 10]
11684punpcklbw m2, m5
11685pmaddubsw m4, m2, m6
11686pmulhrsw m4, m3
11687packuswb m1, m4
11688movu [r0 + 17 * 16], m1
11689
11690;mode 6 [row 3]
11691movu [r0 + 67 * 16], m1
11692
11693; mode 4 row [row 1]
11694pmaddubsw m1, m0, [r5 + 10 * 16]
11695pmulhrsw m1, m3
11696pmaddubsw m4, m2, [r5 + 10 * 16]
11697pmulhrsw m4, m3
11698packuswb m1, m4
11699movu [r0 + 33 * 16], m1
11700
11701; mode 4 row [row 2]
11702pmaddubsw m1, m0, [r5 + 31 * 16]
11703pmulhrsw m1, m3
11704pmaddubsw m4, m2, [r5 + 31 * 16]
11705pmulhrsw m4, m3
11706packuswb m1, m4
11707movu [r0 + 34 * 16], m1
11708
11709; mode 7 [row 6]
11710movu [r0 + 86 * 16], m1
11711
11712; mode 5 row [row 1]
11713pmaddubsw m1, m0, [r5 + 2 * 16]
11714pmulhrsw m1, m3
11715pmaddubsw m4, m2, [r5 + 2 * 16]
11716pmulhrsw m4, m3
11717packuswb m1, m4
11718movu [r0 + 49 * 16], m1
11719
11720; mode 5 row [row 2]
11721pmaddubsw m1, m0, [r5 + 19 * 16]
11722pmulhrsw m1, m3
11723pmaddubsw m4, m2, [r5 + 19 * 16]
11724pmulhrsw m4, m3
11725packuswb m1, m4
11726movu [r0 + 50 * 16], m1
11727
11728; mode 6 [row 2]
11729pmaddubsw m1, m0, [r5 + 7 * 16]
11730pmulhrsw m1, m3
11731pmaddubsw m4, m2, [r5 + 7 * 16]
11732pmulhrsw m4, m3
11733packuswb m1, m4
11734movu [r0 + 66 * 16], m1
11735
11736; mode 7 [row 3]
11737pmaddubsw m1, m0, [r5 + 4 * 16]
11738pmulhrsw m1, m3
11739pmaddubsw m4, m2, [r5 + 4 * 16]
11740pmulhrsw m4, m3
11741packuswb m1, m4
11742movu [r0 + 83 * 16], m1
11743
11744; mode 7 [row 4]
11745pmaddubsw m1, m0, [r5 + 13 * 16]
11746pmulhrsw m1, m3
11747pmaddubsw m4, m2, [r5 + 13 * 16]
11748pmulhrsw m4, m3
11749packuswb m1, m4
11750movu [r0 + 84 * 16], m1
11751
11752; mode 8 [row 8]
11753movu [r0 + 104 * 16], m1
11754
11755; mode 7 [row 5]
11756pmaddubsw m1, m0, [r5 + 22 * 16]
11757pmulhrsw m1, m3
11758pmaddubsw m4, m2, [r5 + 22 * 16]
11759pmulhrsw m4, m3
11760packuswb m1, m4
11761movu [r0 + 85 * 16], m1
11762
11763; mode 8 [row 6]
11764pmaddubsw m1, m0, [r5 + 3 * 16]
11765pmulhrsw m1, m3
11766pmaddubsw m4, m2, [r5 + 3 * 16]
11767pmulhrsw m4, m3
11768packuswb m1, m4
11769movu [r0 + 102 * 16], m1
11770
11771; mode 8 [row 7]
11772pmaddubsw m1, m0, [r5 + 8 * 16]
11773pmulhrsw m1, m3
11774pmaddubsw m4, m2, [r5 + 8 * 16]
11775pmulhrsw m4, m3
11776packuswb m1, m4
11777movu [r0 + 103 * 16], m1
11778
11779; mode 8 [row 9]
11780pmaddubsw m1, m0, [r5 + 18 * 16]
11781pmulhrsw m1, m3
11782pmaddubsw m4, m2, [r5 + 18 * 16]
11783pmulhrsw m4, m3
11784packuswb m1, m4
11785movu [r0 + 105 * 16], m1
11786
11787; mode 8 [row 10]
11788pmaddubsw m1, m0, [r5 + 23 * 16]
11789pmulhrsw m1, m3
11790pmaddubsw m4, m2, [r5 + 23 * 16]
11791pmulhrsw m4, m3
11792packuswb m1, m4
11793movu [r0 + 106 * 16], m1
11794
11795; mode 8 [row 11]
11796pmaddubsw m1, m0, [r5 + 28 * 16]
11797pmulhrsw m1, m3
11798pmaddubsw m4, m2, [r5 + 28 * 16]
11799pmulhrsw m4, m3
11800packuswb m1, m4
11801movu [r0 + 107 * 16], m1
11802
11803; mode 3 [row 2]
11804movu m0, [r4 + 3]
11805movd m1, [r4 + 19]
11806palignr m1, m0, 1
11807punpcklbw m0, m1
11808
11809; mode 17 [row 6 - second half]
11810pmaddubsw m1, m0, [r5 + 10 * 16]
11811pmulhrsw m1, m3
11812packuswb m1, m1
11813movh [r0 + 246 * 16 + 8], m1
11814; mode 17 [row 6 - second half] end
11815
11816pmaddubsw m1, m0, [r5 + 14 * 16]
11817pmulhrsw m1, m3
11818movu m2, [r4 + 11]
11819movd m4, [r4 + 27]
11820palignr m4, m2, 1
11821punpcklbw m2, m4
11822pmaddubsw m4, m2, [r5 + 14 * 16]
11823pmulhrsw m4, m3
11824packuswb m1, m4
11825movu [r0 + 18 * 16], m1
11826
11827; mode 6 [row 5]
11828movu [r0 + 69 * 16], m1
11829
11830; mode 4 row [row 3]
11831pmaddubsw m1, m0, [r5 + 20 * 16]
11832pmulhrsw m1, m3
11833pmaddubsw m4, m2, [r5 + 20 * 16]
11834pmulhrsw m4, m3
11835packuswb m1, m4
11836movu [r0 + 35 * 16], m1
11837
11838; mode 5 row [row 3]
11839pmaddubsw m1, m0, [r5 + 4 * 16]
11840pmulhrsw m1, m3
11841pmaddubsw m4, m2, [r5 + 4 * 16]
11842pmulhrsw m4, m3
11843packuswb m1, m4
11844movu [r0 + 51 * 16], m1
11845
11846; mode 5 row [row 4]
11847pmaddubsw m1, m0, [r5 + 21 * 16]
11848pmulhrsw m1, m3
11849pmaddubsw m4, m2, [r5 + 21 * 16]
11850pmulhrsw m4, m3
11851packuswb m1, m4
11852movu [r0 + 52 * 16], m1
11853
11854; mode 6 [row 4]
11855pmaddubsw m1, m0, [r5 + 1 * 16]
11856pmulhrsw m1, m3
11857pmaddubsw m4, m2, [r5 + 1 * 16]
11858pmulhrsw m4, m3
11859packuswb m1, m4
11860movu [r0 + 68 * 16], m1
11861
11862; mode 6 [row 6]
11863pmaddubsw m1, m0, [r5 + 27 * 16]
11864pmulhrsw m1, m3
11865pmaddubsw m4, m2, [r5 + 27 * 16]
11866pmulhrsw m4, m3
11867packuswb m1, m4
11868movu [r0 + 70 * 16], m1
11869
11870; mode 7 [row 7]
11871pmaddubsw m1, m0, [r5 + 8 * 16]
11872pmulhrsw m1, m3
11873pmaddubsw m4, m2, [r5 + 8 * 16]
11874pmulhrsw m4, m3
11875packuswb m1, m4
11876movu [r0 + 87 * 16], m1
11877
11878; mode 7 [row 8]
11879pmaddubsw m1, m0, [r5 + 17 * 16]
11880pmulhrsw m1, m3
11881pmaddubsw m4, m2, [r5 + 17 * 16]
11882pmulhrsw m4, m3
11883packuswb m1, m4
11884movu [r0 + 88 * 16], m1
11885
11886; mode 7 [row 9]
11887pmaddubsw m1, m0, [r5 + 26 * 16]
11888pmulhrsw m1, m3
11889pmaddubsw m4, m2, [r5 + 26 * 16]
11890pmulhrsw m4, m3
11891packuswb m1, m4
11892movu [r0 + 89 * 16], m1
11893
11894; mode 8 [row 12]
11895pmaddubsw m1, m0, [r5 + 1 * 16]
11896pmulhrsw m1, m3
11897pmaddubsw m4, m2, [r5 + 1 * 16]
11898pmulhrsw m4, m3
11899packuswb m1, m4
11900movu [r0 + 108 * 16], m1
11901
11902; mode 8 [row 13]
11903pmaddubsw m1, m0, [r5 + 6 * 16]
11904pmulhrsw m1, m3
11905pmaddubsw m4, m2, [r5 + 6 * 16]
11906pmulhrsw m4, m3
11907packuswb m1, m4
11908movu [r0 + 109 * 16], m1
11909
11910; mode 8 [row 14]
11911pmaddubsw m1, m0, [r5 + 11 * 16]
11912pmulhrsw m1, m3
11913pmaddubsw m4, m2, [r5 + 11 * 16]
11914pmulhrsw m4, m3
11915packuswb m1, m4
11916movu [r0 + 110 * 16], m1
11917
11918; mode 8 [row 15]
11919pmaddubsw m1, m0, [r5 + 16 * 16]
11920pmulhrsw m1, m3
11921pmaddubsw m4, m2, [r5 + 16 * 16]
11922pmulhrsw m4, m3
11923packuswb m1, m4
11924movu [r0 + 111 * 16], m1
11925
11926; mode 3 [row 3]
11927movu m0, [r4 + 4]
11928movd m1, [r4 + 20]
11929palignr m1, m0, 1
11930punpcklbw m0, m1
11931
11932; mode 17 [row 4 - second half]
11933pmaddubsw m1, m0, [r5 + 30 * 16]
11934pmulhrsw m1, m3
11935packuswb m1, m1
11936movh [r0 + 244 * 16 + 8], m1
11937; mode 17 [row 4 - second half] end
11938
11939; mode 17 [row 5 - second half]
11940pmaddubsw m1, m0, [r5 + 4 * 16]
11941pmulhrsw m1, m3
11942packuswb m1, m1
11943movh [r0 + 245 * 16 + 8], m1
11944; mode 17 [row 5 - second half] end
11945
11946pmaddubsw m1, m0, [r5 + 8 * 16]
11947pmulhrsw m1, m3
11948movu m2, [r4 + 12]
11949movd m4, [r4 + 28]
11950palignr m4, m2, 1
11951punpcklbw m2, m4
11952pmaddubsw m4, m2, [r5 + 8 * 16]
11953pmulhrsw m4, m3
11954packuswb m1, m4
11955movu [r0 + 19 * 16], m1
11956
11957; mode 6 [row 7]
11958movu [r0 + 71 * 16], m1
11959
11960; mode 4 row [row 4]
11961pmaddubsw m1, m0, [r5 + 9 * 16]
11962pmulhrsw m1, m3
11963pmaddubsw m4, m2, [r5 + 9 * 16]
11964pmulhrsw m4, m3
11965packuswb m1, m4
11966movu [r0 + 36 * 16], m1
11967
11968; mode 4 row [row 5]
11969pmaddubsw m1, m0, [r5 + 30 * 16]
11970pmulhrsw m1, m3
11971pmaddubsw m4, m2, [r5 + 30 * 16]
11972pmulhrsw m4, m3
11973packuswb m1, m4
11974movu [r0 + 37 * 16], m1
11975
11976; mode 7 row [row 13]
11977movu [r0 + 93 * 16], m1
11978
11979; mode 5 row [row 5]
11980pmaddubsw m1, m0, [r5 + 6 * 16]
11981pmulhrsw m1, m3
11982pmaddubsw m4, m2, [r5 + 6 * 16]
11983pmulhrsw m4, m3
11984packuswb m1, m4
11985movu [r0 + 53 * 16], m1
11986
11987; mode 5 row [row 6]
11988pmaddubsw m1, m0, [r5 + 23 * 16]
11989pmulhrsw m1, m3
11990pmaddubsw m4, m2, [r5 + 23 * 16]
11991pmulhrsw m4, m3
11992packuswb m1, m4
11993movu [r0 + 54 * 16], m1
11994
11995; mode 6 [row 8]
11996pmaddubsw m1, m0, [r5 + 21 * 16]
11997pmulhrsw m1, m3
11998pmaddubsw m4, m2, [r5 + 21 * 16]
11999pmulhrsw m4, m3
12000packuswb m1, m4
12001movu [r0 + 72 * 16], m1
12002
12003; mode 7 [row 12]
12004movu [r0 + 92 * 16], m1
12005
12006; mode 7 [row 10]
12007pmaddubsw m1, m0, [r5 + 3 * 16]
12008pmulhrsw m1, m3
12009pmaddubsw m4, m2, [r5 + 3 * 16]
12010pmulhrsw m4, m3
12011packuswb m1, m4
12012movu [r0 + 90 * 16], m1
12013
12014; mode 7 [row 11]
12015pmaddubsw m1, m0, [r5 + 12 * 16]
12016pmulhrsw m1, m3
12017pmaddubsw m4, m2, [r5 + 12 * 16]
12018pmulhrsw m4, m3
12019packuswb m1, m4
12020movu [r0 + 91 * 16], m1
12021
12022; mode 3 [row 4]
; Build the pmaddubsw operand for the left 8 columns:
; m0 = interleaved byte pairs ([r4 + 5 + i], [r4 + 6 + i]), i = 0..7.
12023movu m0, [r4 + 5]
; Dword that directly follows the 16-byte window above (offset 5 + 16), matching the
; sibling row setups. It only supplies byte 15 of the shifted copy, and punpcklbw
; below reads the low 8 bytes only, so the predicted pixels do not depend on it.
12024movd m1, [r4 + 21]
12025palignr m1, m0, 1 ; m1 = window shifted down by one byte
12026punpcklbw m0, m1
12027
12028; mode 17 [row 3 - second half]
12029pmaddubsw m1, m0, [r5 + 24 * 16]
12030pmulhrsw m1, m3
12031packuswb m1, m1
12032movh [r0 + 243 * 16 + 8], m1
12033
12034; mode 17 [row 3 - second half] end
12035pmaddubsw m1, m0, [r5 + 2 * 16]
12036pmulhrsw m1, m3
12037movu m2, [r4 + 13]
12038movd m4, [r4 + 29]
12039palignr m4, m2, 1
12040punpcklbw m2, m4
12041pmaddubsw m4, m2, [r5 + 2 * 16]
12042pmulhrsw m4, m3
12043packuswb m1, m4
12044movu [r0 + 20 * 16], m1
12045
12046;mode 6 [row 9]
12047movu [r0 + 73 * 16], m1
12048
12049; mode 4 row [row 6]
12050movu m6, [r5 + 19 * 16]
12051pmaddubsw m1, m0, m6
12052pmulhrsw m1, m3
12053pmaddubsw m4, m2, m6
12054pmulhrsw m4, m3
12055packuswb m1, m4
12056movu [r0 + 38 * 16], m1
12057
12058; mode 3 [row 5]
12059pmaddubsw m1, m0, [r5 + 28 * 16]
12060pmulhrsw m1, m3
12061pmaddubsw m4, m2, [r5 + 28 * 16]
12062pmulhrsw m4, m3
12063packuswb m1, m4
12064movu [r0 + 21 * 16], m1
12065
12066;mode 6 [row 11]
12067movu [r0 + 75 * 16], m1
12068
12069; mode 5 row [row 7]
12070pmaddubsw m1, m0, [r5 + 8 * 16]
12071pmulhrsw m1, m3
12072pmaddubsw m4, m2, [r5 + 8 * 16]
12073pmulhrsw m4, m3
12074packuswb m1, m4
12075movu [r0 + 55 * 16], m1
12076
12077; mode 5 row [row 8]
12078pmaddubsw m1, m0, [r5 + 25 * 16]
12079pmulhrsw m1, m3
12080pmaddubsw m4, m2, [r5 + 25 * 16]
12081pmulhrsw m4, m3
12082packuswb m1, m4
12083movu [r0 + 56 * 16], m1
12084
12085; mode 6 [row 10]
12086pmaddubsw m1, m0, [r5 + 15 * 16]
12087pmulhrsw m1, m3
12088pmaddubsw m4, m2, [r5 + 15 * 16]
12089pmulhrsw m4, m3
12090packuswb m1, m4
12091movu [r0 + 74 * 16], m1
12092
12093; mode 7 [row 14]
12094pmaddubsw m1, m0, [r5 + 7 * 16]
12095pmulhrsw m1, m3
12096pmaddubsw m4, m2, [r5 + 7 * 16]
12097pmulhrsw m4, m3
12098packuswb m1, m4
12099movu [r0 + 94 * 16], m1
12100
12101; mode 7 [row 15]
12102pmaddubsw m1, m0, [r5 + 16 * 16]
12103pmulhrsw m1, m3
12104pmaddubsw m4, m2, [r5 + 16 * 16]
12105pmulhrsw m4, m3
12106packuswb m1, m4
12107movu [r0 + 95 * 16], m1
12108
12109; mode 3 [row 6]
12110movu m0, [r4 + 6]
12111movd m1, [r4 + 22]
12112palignr m1, m0, 1
12113punpcklbw m0, m1
12114
12115; mode 17 [row 2 - second half]
12116pmaddubsw m1, m0, [r5 + 18 * 16]
12117pmulhrsw m1, m3
12118packuswb m1, m1
12119movh [r0 + 242 * 16 + 8], m1
12120; mode 17 [row 2 - second half] end
12121
12122pmaddubsw m1, m0, [r5 + 22 * 16]
12123pmulhrsw m1, m3
12124movu m2, [r4 + 14]
12125movd m4, [r4 + 30]
12126palignr m4, m2, 1
12127punpcklbw m2, m4
12128pmaddubsw m4, m2, [r5 + 22 * 16]
12129pmulhrsw m4, m3
12130packuswb m1, m4
12131movu [r0 + 22 * 16], m1
12132
12133; mode 6 [row 13]
12134movu [r0 + 77 * 16], m1
12135
12136; mode 4 row [row 7]
12137pmaddubsw m1, m0, [r5 + 8 * 16]
12138pmulhrsw m1, m3
12139pmaddubsw m4, m2, [r5 + 8 * 16]
12140pmulhrsw m4, m3
12141packuswb m1, m4
12142movu [r0 + 39 * 16], m1
12143
12144; mode 4 row [row 8]
12145pmaddubsw m1, m0, [r5 + 29 * 16]
12146pmulhrsw m1, m3
12147pmaddubsw m4, m2, [r5 + 29 * 16]
12148pmulhrsw m4, m3
12149packuswb m1, m4
12150movu [r0 + 40 * 16], m1
12151
12152; mode 5 row [row 9]
12153pmaddubsw m1, m0, [r5 + 10 * 16]
12154pmulhrsw m1, m3
12155pmaddubsw m4, m2, [r5 + 10 * 16]
12156pmulhrsw m4, m3
12157packuswb m1, m4
12158movu [r0 + 57 * 16], m1
12159
12160; mode 5 row [row 10]
12161pmaddubsw m1, m0, [r5 + 27 * 16]
12162pmulhrsw m1, m3
12163pmaddubsw m4, m2, [r5 + 27 * 16]
12164pmulhrsw m4, m3
12165packuswb m1, m4
12166movu [r0 + 58 * 16], m1
12167
12168; mode 6 [row 12]
12169pmaddubsw m1, m0, [r5 + 9 * 16]
12170pmulhrsw m1, m3
12171pmaddubsw m4, m2, [r5 + 9 * 16]
12172pmulhrsw m4, m3
12173packuswb m1, m4
12174movu [r0 + 76 * 16], m1
12175
12176; mode 3 [row 7]
; m0 = interleaved byte pairs ([r4 + 7 + i], [r4 + 8 + i]), i = 0..7 (left 8 columns).
12177movu m0, [r4 + 7]
; Dword following the 16-byte window (offset 7 + 16), as in the sibling row setups.
; Only byte 15 of the shifted copy comes from it and punpcklbw ignores that byte.
12178movd m1, [r4 + 23]
12179palignr m1, m0, 1
12180punpcklbw m0, m1
12181
12182; mode 17 [row 1 - second half]
; Reuses the same source pairs with weight row 12; 8 result bytes go to the upper
; half (+8) of output slot 241.
12183pmaddubsw m1, m0, [r5 + 12 * 16]
12184pmulhrsw m1, m3
12185packuswb m1, m1
12186movh [r0 + 241 * 16 + 8], m1
12187; mode 17 [row 1 - second half] end
12188
; mode 3 [row 7] proper: weight row 16 applied to both 8-column halves.
12189pmaddubsw m1, m0, [r5 + 16 * 16]
12190pmulhrsw m1, m3
; m2 = interleaved byte pairs ([r4 + 15 + i], [r4 + 16 + i]), i = 0..7 (right 8 columns).
12191movu m2, [r4 + 15]
; Dword following this window (offset 15 + 16); again only feeds the unused byte 15.
12192movd m4, [r4 + 31]
12193palignr m4, m2, 1
12194punpcklbw m2, m4
12195pmaddubsw m4, m2, [r5 + 16 * 16]
12196pmulhrsw m4, m3
12197packuswb m1, m4 ; low 8 bytes = left half, high 8 bytes = right half
12198movu [r0 + 23 * 16], m1
12199
12200; mode 6 [row 15]
12201movu [r0 + 79 * 16], m1
12202
12203; mode 4 row [row 9]
12204pmaddubsw m1, m0, [r5 + 18 * 16]
12205pmulhrsw m1, m3
12206pmaddubsw m4, m2, [r5 + 18 * 16]
12207pmulhrsw m4, m3
12208packuswb m1, m4
12209movu [r0 + 41 * 16], m1
12210
12211; mode 5 row [row 11]
12212pmaddubsw m1, m0, [r5 + 12 * 16]
12213pmulhrsw m1, m3
12214pmaddubsw m4, m2, [r5 + 12 * 16]
12215pmulhrsw m4, m3
12216packuswb m1, m4
12217movu [r0 + 59 * 16], m1
12218
12219; mode 5 row [row 12]
12220pmaddubsw m1, m0, [r5 + 29 * 16]
12221pmulhrsw m1, m3
12222pmaddubsw m4, m2, [r5 + 29 * 16]
12223pmulhrsw m4, m3
12224packuswb m1, m4
12225movu [r0 + 60 * 16], m1
12226
12227; mode 6 [row 14]
12228pmaddubsw m1, m0, [r5 + 3 * 16]
12229pmulhrsw m1, m3
12230pmaddubsw m4, m2, [r5 + 3 * 16]
12231pmulhrsw m4, m3
12232packuswb m1, m4
12233movu [r0 + 78 * 16], m1
12234
12235; mode 3 [row 8]
12236movu m0, [r4 + 8]
12237movd m1, [r4 + 24]
12238palignr m1, m0, 1
12239punpcklbw m0, m1
12240pmaddubsw m1, m0, [r5 + 10 * 16]
12241pmulhrsw m1, m3
12242movu m2, [r4 + 16]
12243psrldq m4, m2, 1
12244pinsrb m4, [r4 + 32], 15
12245punpcklbw m2, m4
12246pmaddubsw m4, m2, [r5 + 10 * 16]
12247pmulhrsw m4, m3
12248packuswb m1, m4
12249movu [r0 + 24 * 16], m1
12250
12251; mode 4 row [row 10]
12252pmaddubsw m1, m0, [r5 + 7 * 16]
12253pmulhrsw m1, m3
12254pmaddubsw m4, m2, [r5 + 7 * 16]
12255pmulhrsw m4, m3
12256packuswb m1, m4
12257movu [r0 + 42 * 16], m1
12258
12259; mode 4 row [row 11]
12260pmaddubsw m1, m0, [r5 + 28 * 16]
12261pmulhrsw m1, m3
12262pmaddubsw m4, m2, [r5 + 28 * 16]
12263pmulhrsw m4, m3
12264packuswb m1, m4
12265movu [r0 + 43 * 16], m1
12266
12267; mode 5 row [row 13]
12268pmaddubsw m1, m0, [r5 + 14 * 16]
12269pmulhrsw m1, m3
12270pmaddubsw m4, m2, [r5 + 14 * 16]
12271pmulhrsw m4, m3
12272packuswb m1, m4
12273movu [r0 + 61 * 16], m1
12274
12275; mode 5 row [row 14]
12276pmaddubsw m1, m0, [r5 + 31 * 16]
12277pmulhrsw m1, m3
12278pmaddubsw m4, m2, [r5 + 31 * 16]
12279pmulhrsw m4, m3
12280packuswb m1, m4
12281movu [r0 + 62 * 16], m1
12282
12283; mode 3 [row 9]
; m0 = interleaved byte pairs ([r4 + 9 + i], [r4 + 10 + i]), i = 0..7 (left 8 columns).
12284movu m0, [r4 + 9]
; Dword following the 16-byte window (offset 9 + 16), matching the sibling row
; setups. It only lands in byte 15 of the shifted copy, which punpcklbw never reads.
12285movd m1, [r4 + 25]
12286palignr m1, m0, 1
12287punpcklbw m0, m1
12288pmaddubsw m1, m0, [r5 + 4 * 16]
12289pmulhrsw m1, m3
; m2 = interleaved byte pairs ([r4 + 17 + i], [r4 + 18 + i]), i = 0..7 (right 8 columns).
12290movu m2, [r4 + 17]
12291movd m4, [r4 + 33]
12292palignr m4, m2, 1
12293punpcklbw m2, m4
12294pmaddubsw m4, m2, [r5 + 4 * 16]
12295pmulhrsw m4, m3
12296packuswb m1, m4 ; low 8 bytes = left half, high 8 bytes = right half
12297movu [r0 + 25 * 16], m1
12298
12299; mode 4 row [row 12]
12300pmaddubsw m1, m0, [r5 + 17 * 16]
12301pmulhrsw m1, m3
12302pmaddubsw m4, m2, [r5 + 17 * 16]
12303pmulhrsw m4, m3
12304packuswb m1, m4
12305movu [r0 + 44 * 16], m1
12306
12307; mode 3 [row 10]
12308pmaddubsw m1, m0, [r5 + 30 * 16]
12309pmulhrsw m1, m3
12310pmaddubsw m4, m2, [r5 + 30 * 16]
12311pmulhrsw m4, m3
12312packuswb m1, m4
12313movu [r0 + 26 * 16], m1
12314
12315; mode 5 row [row 15]
12316pmaddubsw m1, m0, [r5 + 16 * 16]
12317pmulhrsw m1, m3
12318pmaddubsw m4, m2, [r5 + 16 * 16]
12319pmulhrsw m4, m3
12320packuswb m1, m4
12321movu [r0 + 63 * 16], m1
12322
12323; mode 3 [row 11]
12324movu m0, [r4 + 10]
12325movd m1, [r4 + 26]
12326palignr m1, m0, 1
12327punpcklbw m0, m1
12328pmaddubsw m1, m0, [r5 + 24 * 16]
12329pmulhrsw m1, m3
12330movu m2, [r4 + 18]
12331movd m4, [r4 + 34]
12332palignr m4, m2, 1
12333punpcklbw m2, m4
12334pmaddubsw m4, m2, [r5 + 24 * 16]
12335pmulhrsw m4, m3
12336packuswb m1, m4
12337movu [r0 + 27 * 16], m1
12338
12339; mode 4 row [row 13]
12340pmaddubsw m1, m0, [r5 + 6 * 16]
12341pmulhrsw m1, m3
12342pmaddubsw m4, m2, [r5 + 6 * 16]
12343pmulhrsw m4, m3
12344packuswb m1, m4
12345movu [r0 + 45 * 16], m1
12346
12347; mode 4 row [row 14]
12348pmaddubsw m1, m0, [r5 + 27 * 16]
12349pmulhrsw m1, m3
12350pmaddubsw m4, m2, [r5 + 27 * 16]
12351pmulhrsw m4, m3
12352packuswb m1, m4
12353movu [r0 + 46 * 16], m1
12354
12355; mode 3 [row 12]
12356movu m0, [r4 + 11]
12357movd m1, [r4 + 27]
12358palignr m1, m0, 1
12359punpcklbw m0, m1
12360pmaddubsw m1, m0, [r5 + 18 * 16]
12361pmulhrsw m1, m3
12362movu m2, [r4 + 19]
12363movd m4, [r4 + 35]
12364palignr m4, m2, 1
12365punpcklbw m2, m4
12366pmaddubsw m4, m2, [r5 + 18 * 16]
12367pmulhrsw m4, m3
12368packuswb m1, m4
12369movu [r0 + 28 * 16], m1
12370
12371; mode 4 row [row 15]
12372pmaddubsw m1, m0, [r5 + 16 * 16]
12373pmulhrsw m1, m3
12374pmaddubsw m4, m2, [r5 + 16 * 16]
12375pmulhrsw m4, m3
12376packuswb m1, m4
12377movu [r0 + 47 * 16], m1
12378
12379; mode 3 [row 13]
12380movu m0, [r4 + 12]
12381movd m1, [r4 + 28]
12382palignr m1, m0, 1
12383punpcklbw m0, m1
12384pmaddubsw m1, m0, [r5 + 12 * 16]
12385pmulhrsw m1, m3
12386movu m2, [r4 + 20]
12387movd m4, [r4 + 36]
12388palignr m4, m2, 1
12389punpcklbw m2, m4
12390pmaddubsw m4, m2, [r5 + 12 * 16]
12391pmulhrsw m4, m3
12392packuswb m1, m4
12393movu [r0 + 29 * 16], m1
12394
12395; mode 3 [row 14]
12396movu m0, [r4 + 13]
12397movd m1, [r4 + 29]
12398palignr m1, m0, 1
12399punpcklbw m0, m1
12400pmaddubsw m1, m0, [r5 + 6 * 16]
12401pmulhrsw m1, m3
12402movu m2, [r4 + 21]
12403movd m4, [r4 + 37]
12404palignr m4, m2, 1
12405punpcklbw m2, m4
12406pmaddubsw m4, m2, [r5 + 6 * 16]
12407pmulhrsw m4, m3
12408packuswb m1, m4
12409movu [r0 + 30 * 16], m1
12410
12411; mode 9
12412movu m0, [r2 + 1]
12413movd m1, [r2 + 17]
12414palignr m1, m0, 1
12415
12416; mode 9 [row 15]
12417movu [r0 + 127 * 16], m1
12418
12419; mode 9 [row 0]
; m0 holds the 16 bytes at [r2 + 1] and m1 the same window shifted by one byte (both
; prepared by the mode 9 setup just above). Interleaving them gives the byte pairs
; ([r2 + 1 + i], [r2 + 2 + i]), i = 0..7, for the left 8 columns.
12420punpcklbw m0, m1
12421pmaddubsw m1, m0, [r5 + 2 * 16]
12422pmulhrsw m1, m3
; Right 8 columns: m7 = byte pairs ([r2 + 9 + i], [r2 + 10 + i]), i = 0..7.
12423movu m7, [r2 + 9]
; Only the low 8 bytes of the shifted copy are consumed by punpcklbw, so a register
; byte shift is sufficient. This replaces a load into m4 that was never used and a
; palignr whose top byte came from whatever m2 held before.
12424psrldq m2, m7, 1
12426punpcklbw m7, m2
12427pmaddubsw m2, m7, [r5 + 2 * 16]
12428pmulhrsw m2, m3
12429packuswb m1, m2 ; low 8 bytes = left half, high 8 bytes = right half
12430movu [r0 + 112 * 16], m1
12431
12432; mode 9 [row 1]
12433pmaddubsw m1, m0, [r5 + 4 * 16]
12434pmulhrsw m1, m3
12435pmaddubsw m2, m7, [r5 + 4 * 16]
12436pmulhrsw m2, m3
12437packuswb m1, m2
12438movu [r0 + 113 * 16], m1
12439
12440; mode 9 [row 2]
12441pmaddubsw m1, m0, [r5 + 6 * 16]
12442pmulhrsw m1, m3
12443pmaddubsw m2, m7, [r5 + 6 * 16]
12444pmulhrsw m2, m3
12445packuswb m1, m2
12446movu [r0 + 114 * 16], m1
12447
12448; mode 9 [row 3]
12449pmaddubsw m1, m0, [r5 + 8 * 16]
12450pmulhrsw m1, m3
12451pmaddubsw m2, m7, [r5 + 8 * 16]
12452pmulhrsw m2, m3
12453packuswb m1, m2
12454movu [r0 + 115 * 16], m1
12455
12456; mode 9 [row 4]
12457pmaddubsw m1, m0, [r5 + 10 * 16]
12458pmulhrsw m1, m3
12459pmaddubsw m2, m7, [r5 + 10 * 16]
12460pmulhrsw m2, m3
12461packuswb m1, m2
12462movu [r0 + 116 * 16], m1
12463
12464; mode 9 [row 5]
12465pmaddubsw m1, m0, [r5 + 12 * 16]
12466pmulhrsw m1, m3
12467pmaddubsw m2, m7, [r5 + 12 * 16]
12468pmulhrsw m2, m3
12469packuswb m1, m2
12470movu [r0 + 117 * 16], m1
12471
12472; mode 9 [row 6]
12473pmaddubsw m1, m0, [r5 + 14 * 16]
12474pmulhrsw m1, m3
12475pmaddubsw m2, m7, [r5 + 14 * 16]
12476pmulhrsw m2, m3
12477packuswb m1, m2
12478movu [r0 + 118 * 16], m1
12479
12480; mode 9 [row 7]
12481pmaddubsw m1, m0, [r5 + 16 * 16]
12482pmulhrsw m1, m3
12483pmaddubsw m2, m7, [r5 + 16 * 16]
12484pmulhrsw m2, m3
12485packuswb m1, m2
12486movu [r0 + 119 * 16], m1
12487
12488; mode 9 [row 8]
12489pmaddubsw m1, m0, [r5 + 18 * 16]
12490pmulhrsw m1, m3
12491pmaddubsw m2, m7, [r5 + 18 * 16]
12492pmulhrsw m2, m3
12493packuswb m1, m2
12494movu [r0 + 120 * 16], m1
12495
12496; mode 9 [row 9]
12497pmaddubsw m1, m0, [r5 + 20 * 16]
12498pmulhrsw m1, m3
12499pmaddubsw m2, m7, [r5 + 20 * 16]
12500pmulhrsw m2, m3
12501packuswb m1, m2
12502movu [r0 + 121 * 16], m1
12503
12504; mode 9 [row 10]
12505pmaddubsw m1, m0, [r5 + 22 * 16]
12506pmulhrsw m1, m3
12507pmaddubsw m2, m7, [r5 + 22 * 16]
12508pmulhrsw m2, m3
12509packuswb m1, m2
12510movu [r0 + 122 * 16], m1
12511
12512; mode 9 [row 11]
12513pmaddubsw m1, m0, [r5 + 24 * 16]
12514pmulhrsw m1, m3
12515pmaddubsw m2, m7, [r5 + 24 * 16]
12516pmulhrsw m2, m3
12517packuswb m1, m2
12518movu [r0 + 123 * 16], m1
12519
12520; mode 9 [row 12]
12521pmaddubsw m1, m0, [r5 + 26 * 16]
12522pmulhrsw m1, m3
12523pmaddubsw m2, m7, [r5 + 26 * 16]
12524pmulhrsw m2, m3
12525packuswb m1, m2
12526movu [r0 + 124 * 16], m1
12527
12528; mode 9 [row 13]
12529pmaddubsw m1, m0, [r5 + 28 * 16]
12530pmulhrsw m1, m3
12531pmaddubsw m2, m7, [r5 + 28 * 16]
12532pmulhrsw m2, m3
12533packuswb m1, m2
12534movu [r0 + 125 * 16], m1
12535
12536; mode 9 [row 14]
12537pmaddubsw m1, m0, [r5 + 30 * 16]
12538pmulhrsw m1, m3
12539pmaddubsw m2, m7, [r5 + 30 * 16]
12540pmulhrsw m2, m3
12541packuswb m1, m2
12542movu [r0 + 126 * 16], m1
12543
12544; mode 10
; Step 1: all 16 output rows (slots 128..143, 16 bytes each) receive the same
; 16 bytes loaded from [r2 + 1].
12545movu m1, [r2 + 1]
12546movu [r0 + 128 * 16], m1
12547movu [r0 + 129 * 16], m1
12548movu [r0 + 130 * 16], m1
12549movu [r0 + 131 * 16], m1
12550movu [r0 + 132 * 16], m1
12551movu [r0 + 133 * 16], m1
12552movu [r0 + 134 * 16], m1
12553movu [r0 + 135 * 16], m1
12554movu [r0 + 136 * 16], m1
12555movu [r0 + 137 * 16], m1
12556movu [r0 + 138 * 16], m1
12557movu [r0 + 139 * 16], m1
12558movu [r0 + 140 * 16], m1
12559movu [r0 + 141 * 16], m1
12560movu [r0 + 142 * 16], m1
12561movu [r0 + 143 * 16], m1
12562
; Step 2: compute 16 corrected bytes
;   v[i] = sat_u8( b + ((s[i] - c) >> 1) ),  i = 0..15
; where b = byte at [r2 + 1], c = byte at [r1], s[i] = byte at [r1 + 1 + i].
; NOTE(review): this looks like the boundary (edge) smoothing applied to the first
; sample of each row for this mode - confirm against the C reference.
12563pxor m0, m0
12564pshufb m1, m1, m0 ; all-zero shuffle mask: replicate byte 0 of m1 (b)
12565punpcklbw m1, m0 ; widen to 8 words, each = b
12566movu m2, [r1]
12567pshufb m2, m2, m0 ; replicate byte 0 of [r1] (c)
12568punpcklbw m2, m0 ; widen to 8 words, each = c
12569movu m4, [r1 + 1]
12570punpcklbw m5, m4, m0 ; m5 = s[0..7] as words
12571punpckhbw m4, m0 ; m4 = s[8..15] as words
12572psubw m5, m2
12573psubw m4, m2
12574psraw m5, 1 ; arithmetic shift keeps the sign of (s[i] - c)
12575psraw m4, 1
12576paddw m5, m1
12577paddw m4, m1
12578packuswb m5, m4 ; saturate to 0..255; m5 = v[0..15]
12579
; Step 3: overwrite byte 0 of row i with v[i]; the other 15 bytes of each row keep
; the values written in step 1.
12580pextrb [r0 + 128 * 16], m5, 0
12581pextrb [r0 + 129 * 16], m5, 1
12582pextrb [r0 + 130 * 16], m5, 2
12583pextrb [r0 + 131 * 16], m5, 3
12584pextrb [r0 + 132 * 16], m5, 4
12585pextrb [r0 + 133 * 16], m5, 5
12586pextrb [r0 + 134 * 16], m5, 6
12587pextrb [r0 + 135 * 16], m5, 7
12588pextrb [r0 + 136 * 16], m5, 8
12589pextrb [r0 + 137 * 16], m5, 9
12590pextrb [r0 + 138 * 16], m5, 10
12591pextrb [r0 + 139 * 16], m5, 11
12592pextrb [r0 + 140 * 16], m5, 12
12593pextrb [r0 + 141 * 16], m5, 13
12594pextrb [r0 + 142 * 16], m5, 14
12595pextrb [r0 + 143 * 16], m5, 15
12596
12597; mode 11
12598movu m0, [r2]
12599
12600; mode 11 [row 15]
12601movu [r0 + 159 * 16], m0
12602
12603; mode 11 [row 0]
12604movu m1, [r2 + 1]
12605punpcklbw m0, m1
12606pmaddubsw m1, m0, [r5 + 30 * 16]
12607pmulhrsw m1, m3
12608movu m7, [r2 + 8]
12609movu m2, [r2 + 9]
12610punpcklbw m7, m2
12611pmaddubsw m2, m7, [r5 + 30 * 16]
12612pmulhrsw m2, m3
12613packuswb m1, m2
12614movu [r0 + 144 * 16], m1
12615
12616; mode 11 [row 1]
12617pmaddubsw m1, m0, [r5 + 28 * 16]
12618pmulhrsw m1, m3
12619pmaddubsw m2, m7, [r5 + 28 * 16]
12620pmulhrsw m2, m3
12621packuswb m1, m2
12622movu [r0 + 145 * 16], m1
12623
12624; mode 11 [row 2]
12625pmaddubsw m1, m0, [r5 + 26 * 16]
12626pmulhrsw m1, m3
12627pmaddubsw m2, m7, [r5 + 26 * 16]
12628pmulhrsw m2, m3
12629packuswb m1, m2
12630movu [r0 + 146 * 16], m1
12631
12632; mode 11 [row 3]
12633pmaddubsw m1, m0, [r5 + 24 * 16]
12634pmulhrsw m1, m3
12635pmaddubsw m2, m7, [r5 + 24 * 16]
12636pmulhrsw m2, m3
12637packuswb m1, m2
12638movu [r0 + 147 * 16], m1
12639
12640; mode 11 [row 4]
12641pmaddubsw m1, m0, [r5 + 22 * 16]
12642pmulhrsw m1, m3
12643pmaddubsw m2, m7, [r5 + 22 * 16]
12644pmulhrsw m2, m3
12645packuswb m1, m2
12646movu [r0 + 148 * 16], m1
12647
12648; mode 11 [row 5]
12649pmaddubsw m1, m0, [r5 + 20 * 16]
12650pmulhrsw m1, m3
12651pmaddubsw m2, m7, [r5 + 20 * 16]
12652pmulhrsw m2, m3
12653packuswb m1, m2
12654movu [r0 + 149 * 16], m1
12655
12656; mode 11 [row 6]
12657pmaddubsw m1, m0, [r5 + 18 * 16]
12658pmulhrsw m1, m3
12659pmaddubsw m2, m7, [r5 + 18 * 16]
12660pmulhrsw m2, m3
12661packuswb m1, m2
12662movu [r0 + 150 * 16], m1
12663
12664; mode 11 [row 7]
12665pmaddubsw m1, m0, [r5 + 16 * 16]
12666pmulhrsw m1, m3
12667pmaddubsw m2, m7, [r5 + 16 * 16]
12668pmulhrsw m2, m3
12669packuswb m1, m2
12670movu [r0 + 151 * 16], m1
12671
12672; mode 11 [row 8]
12673pmaddubsw m1, m0, [r5 + 14 * 16]
12674pmulhrsw m1, m3
12675pmaddubsw m2, m7, [r5 + 14 * 16]
12676pmulhrsw m2, m3
12677packuswb m1, m2
12678movu [r0 + 152 * 16], m1
12679
12680; mode 11 [row 9]
12681pmaddubsw m1, m0, [r5 + 12 * 16]
12682pmulhrsw m1, m3
12683pmaddubsw m2, m7, [r5 + 12 * 16]
12684pmulhrsw m2, m3
12685packuswb m1, m2
12686movu [r0 + 153 * 16], m1
12687
12688; mode 11 [row 10]
12689pmaddubsw m1, m0, [r5 + 10 * 16]
12690pmulhrsw m1, m3
12691pmaddubsw m2, m7, [r5 + 10 * 16]
12692pmulhrsw m2, m3
12693packuswb m1, m2
12694movu [r0 + 154 * 16], m1
12695
12696; mode 11 [row 11]
12697pmaddubsw m1, m0, [r5 + 8 * 16]
12698pmulhrsw m1, m3
12699pmaddubsw m2, m7, [r5 + 8 * 16]
12700pmulhrsw m2, m3
12701packuswb m1, m2
12702movu [r0 + 155 * 16], m1
12703
12704; mode 11 [row 12]
12705pmaddubsw m1, m0, [r5 + 6 * 16]
12706pmulhrsw m1, m3
12707pmaddubsw m2, m7, [r5 + 6 * 16]
12708pmulhrsw m2, m3
12709packuswb m1, m2
12710movu [r0 + 156 * 16], m1
12711
12712; mode 11 [row 13]
12713pmaddubsw m1, m0, [r5 + 4 * 16]
12714pmulhrsw m1, m3
12715pmaddubsw m2, m7, [r5 + 4 * 16]
12716pmulhrsw m2, m3
12717packuswb m1, m2
12718movu [r0 + 157 * 16], m1
12719
12720; mode 11 [row 14]
12721pmaddubsw m1, m0, [r5 + 2 * 16]
12722pmulhrsw m1, m3
12723pmaddubsw m2, m7, [r5 + 2 * 16]
12724pmulhrsw m2, m3
12725packuswb m1, m2
12726movu [r0 + 158 * 16], m1
12727
12728; mode 12 [row 0]
12729movu m0, [r4]
12730movu m1, [r4 + 1]
12731punpcklbw m0, m1
12732pmaddubsw m1, m0, [r5 + 27 * 16]
12733pmulhrsw m1, m3
12734movu m7, [r4 + 8]
12735movd m2, [r4 + 24]
12736palignr m2, m7, 1
12737punpcklbw m7, m2
12738pmaddubsw m2, m7, [r5 + 27 * 16]
12739pmulhrsw m2, m3
12740packuswb m1, m2
12741movu [r0 + 160 * 16], m1
12742
12743; mode 12 [row 1]
12744pmaddubsw m1, m0, [r5 + 22 * 16]
12745pmulhrsw m1, m3
12746pmaddubsw m2, m7, [r5 + 22 * 16]
12747pmulhrsw m2, m3
12748packuswb m1, m2
12749movu [r0 + 161 * 16], m1
12750
12751; mode 12 [row 2]
12752pmaddubsw m1, m0, [r5 + 17 * 16]
12753pmulhrsw m1, m3
12754pmaddubsw m2, m7, [r5 + 17 * 16]
12755pmulhrsw m2, m3
12756packuswb m1, m2
12757movu [r0 + 162 * 16], m1
12758
12759; mode 12 [row 3]
12760pmaddubsw m1, m0, [r5 + 12 * 16]
12761pmulhrsw m1, m3
12762pmaddubsw m2, m7, [r5 + 12 * 16]
12763pmulhrsw m2, m3
12764packuswb m1, m2
12765movu [r0 + 163 * 16], m1
12766
12767; mode 12 [row 4]
12768pmaddubsw m1, m0, [r5 + 7 * 16]
12769pmulhrsw m1, m3
12770pmaddubsw m2, m7, [r5 + 7 * 16]
12771pmulhrsw m2, m3
12772packuswb m1, m2
12773movu [r0 + 164 * 16], m1
12774
12775; mode 12 [row 5]
12776pmaddubsw m1, m0, [r5 + 2 * 16]
12777pmulhrsw m1, m3
12778pmaddubsw m2, m7, [r5 + 2 * 16]
12779pmulhrsw m2, m3
12780packuswb m1, m2
12781movu [r0 + 165 * 16], m1
12782
12783; mode 13 [row 0]
12784pmaddubsw m1, m0, [r5 + 23 * 16]
12785pmulhrsw m1, m3
12786pmaddubsw m2, m7, [r5 + 23 * 16]
12787pmulhrsw m2, m3
12788packuswb m1, m2
12789movu [r0 + 176 * 16], m1
12790
12791; mode 13 [row 1]
12792pmaddubsw m1, m0, [r5 + 14 * 16]
12793pmulhrsw m1, m3
12794pmaddubsw m2, m7, [r5 + 14 * 16]
12795pmulhrsw m2, m3
12796packuswb m1, m2
12797movu [r0 + 177 * 16], m1
12798
12799; mode 13 [row 2]
12800pmaddubsw m1, m0, [r5 + 5 * 16]
12801pmulhrsw m1, m3
12802pmaddubsw m2, m7, [r5 + 5 * 16]
12803pmulhrsw m2, m3
12804packuswb m1, m2
12805movu [r0 + 178 * 16], m1
12806
12807; mode 14 [row 0]
12808pmaddubsw m1, m0, [r5 + 19 * 16]
12809pmulhrsw m1, m3
12810pmaddubsw m2, m7, [r5 + 19 * 16]
12811pmulhrsw m2, m3
12812packuswb m1, m2
12813movu [r0 + 192 * 16], m1
12814
12815; mode 14 [row 1]
12816pmaddubsw m1, m0, [r5 + 6 * 16]
12817pmulhrsw m1, m3
12818pmaddubsw m2, m7, [r5 + 6 * 16]
12819pmulhrsw m2, m3
12820packuswb m1, m2
12821movu [r0 + 193 * 16], m1
12822
12823; mode 17 [row 0]
12824movu [r0 + 240 * 16], m1
12825
12826; mode 15 [row 0]
12827pmaddubsw m1, m0, [r5 + 15 * 16]
12828pmulhrsw m1, m3
12829pmaddubsw m2, m7, [r5 + 15 * 16]
12830pmulhrsw m2, m3
12831packuswb m1, m2
12832movu [r0 + 208 * 16], m1
12833
12834; mode 15 [row 15 - second half]
12835pmaddubsw m1, m0, [r5 + 16 * 16]
12836pmulhrsw m1, m3
12837packuswb m1, m1
12838movh [r0 + 223 * 16 + 8], m1
12839; mode 15 [row 15 - second half] end
12840
12841; mode 16 [row 0]
12842pmaddubsw m1, m0, [r5 + 11 * 16]
12843pmulhrsw m1, m3
12844pmaddubsw m2, m7, [r5 + 11 * 16]
12845pmulhrsw m2, m3
12846packuswb m1, m2
12847movu [r0 + 224 * 16], m1
12848
12849; mode 17 [row 9 - second half]
12850pmaddubsw m1, m0, [r5 + 28 * 16]
12851pmulhrsw m1, m3
12852packuswb m1, m1
12853movh [r0 + 249 * 16 + 8], m1
12854; mode 17 [row 9 - second half] end
12855
12856; mode 17 [row 10 - second half]
12857pmaddubsw m1, m0, [r5 + 2 * 16]
12858pmulhrsw m1, m3
12859packuswb m1, m1
12860movh [r0 + 250 * 16 + 8], m1
12861; mode 17 [row 10 - second half] end
12862
12863; mode 17 [row 1 - first half]
12864pslldq m6, m0, 2
12865pinsrb m6, [r3 + 0], 1
12866pinsrb m6, [r3 + 1], 0
12867pmaddubsw m1, m6, [r5 + 12 * 16]
12868pmulhrsw m1, m3
12869packuswb m1, m1
12870movh [r0 + 241 * 16], m1
12871
12872; mode 17 [row 11 - second half]
12873pmaddubsw m1, m6, [r5 + 8 * 16]
12874pmulhrsw m1, m3
12875packuswb m1, m1
12876movh [r0 + 251 * 16 + 8], m1
12877; mode 17 [row 11 - second half] end
12878
12879; mode 17 [row 2 - first half]
12880pslldq m6, 2
12881pinsrb m6, [r3 + 1], 1
12882pinsrb m6, [r3 + 2], 0
12883pmaddubsw m1, m6, [r5 + 18 * 16]
12884pmulhrsw m1, m3
12885packuswb m1, m1
12886movh [r0 + 242 * 16], m1
12887
12888; mode 17 [row 12 - second half]
12889pmaddubsw m1, m6, [r5 + 14 * 16]
12890pmulhrsw m1, m3
12891packuswb m1, m1
12892movh [r0 + 252 * 16 + 8], m1
12893; mode 17 [row 12 - second half] end
12894
12895; mode 17 [row 3 - first half]
12896pslldq m6, 2
12897pinsrb m6, [r3 + 2], 1
12898pinsrb m6, [r3 + 4], 0
12899pmaddubsw m1, m6, [r5 + 24 * 16]
12900pmulhrsw m1, m3
12901packuswb m1, m1
12902movh [r0 + 243 * 16], m1
12903
12904; mode 17 [row 13 - second half]
12905pmaddubsw m1, m6, [r5 + 20 * 16]
12906pmulhrsw m1, m3
12907packuswb m1, m1
12908movh [r0 + 253 * 16 + 8], m1
12909
12910; mode 17 [row 4 - first half]
12911pslldq m6, 2
12912pinsrb m6, [r3 + 4], 1
12913pinsrb m6, [r3 + 5], 0
12914pmaddubsw m1, m6, [r5 + 30 * 16]
12915pmulhrsw m1, m3
12916packuswb m1, m1
12917movh [r0 + 244 * 16], m1
12918
12919; mode 17 [row 5 - first half]
12920pmaddubsw m1, m6, [r5 + 4 * 16]
12921pmulhrsw m1, m3
12922packuswb m1, m1
12923movh [r0 + 245 * 16], m1
12924
12925; mode 17 [row 14 - second half]
12926pmaddubsw m1, m6, [r5 + 26 * 16]
12927pmulhrsw m1, m3
12928packuswb m1, m1
12929movh [r0 + 254 * 16 + 8], m1
12930; mode 17 [row 14 - second half] end
12931
12932; mode 17 [row 6 - first half]
12933pslldq m6, 2
12934pinsrb m6, [r3 + 5], 1
12935pinsrb m6, [r3 + 6], 0
12936pmaddubsw m1, m6, [r5 + 10 * 16]
12937pmulhrsw m1, m3
12938packuswb m1, m1
12939movh [r0 + 246 * 16], m1
12940
12941; mode 17 [row 7 - first half]
12942pslldq m6, 2
12943pinsrb m6, [r3 + 6], 1
12944pinsrb m6, [r3 + 7], 0
12945pmaddubsw m1, m6, [r5 + 16 * 16]
12946pmulhrsw m1, m3
12947packuswb m1, m1
12948movh [r0 + 247 * 16], m1
12949
12950; mode 17 [row 8 - first half]
12951pslldq m6, 2
12952pinsrb m6, [r3 + 7], 1
12953pinsrb m6, [r3 + 9], 0
12954pmaddubsw m1, m6, [r5 + 22 * 16]
12955pmulhrsw m1, m3
12956packuswb m1, m1
12957movh [r0 + 248 * 16], m1
12958
12959; mode 17 [row 9 - first half]
12960pslldq m6, 2
12961pinsrb m6, [r3 + 9], 1
12962pinsrb m6, [r3 + 10], 0
12963pmaddubsw m1, m6, [r5 + 28 * 16]
12964pmulhrsw m1, m3
12965packuswb m1, m1
12966movh [r0 + 249 * 16], m1
12967
12968; mode 17 [row 10 - first half]
12969pmaddubsw m1, m6, [r5 + 2 * 16]
12970pmulhrsw m1, m3
12971packuswb m1, m1
12972movh [r0 + 250 * 16], m1
12973
12974; mode 17 [row 11 - first half]
12975pslldq m6, 2
12976pinsrb m6, [r3 + 10], 1
12977pinsrb m6, [r3 + 11], 0
12978pmaddubsw m1, m6, [r5 + 8 * 16]
12979pmulhrsw m1, m3
12980packuswb m1, m1
12981movh [r0 + 251 * 16], m1
12982
12983; mode 17 [row 12 - first half]
12984pslldq m6, 2
12985pinsrb m6, [r3 + 11], 1
12986pinsrb m6, [r3 + 12], 0
12987pmaddubsw m1, m6, [r5 + 14 * 16]
12988pmulhrsw m1, m3
12989packuswb m1, m1
12990movh [r0 + 252 * 16], m1
12991
12992; mode 17 [row 13 - first half]
12993pslldq m6, 2
12994pinsrb m6, [r3 + 12], 1
12995pinsrb m6, [r3 + 14], 0
12996pmaddubsw m1, m6, [r5 + 20 * 16]
12997pmulhrsw m1, m3
12998packuswb m1, m1
12999movh [r0 + 253 * 16], m1
13000
13001; mode 17 [row 14 - first half]
13002pslldq m6, 2
13003pinsrb m6, [r3 + 14], 1
13004pinsrb m6, [r3 + 15], 0
13005pmaddubsw m1, m6, [r5 + 26 * 16]
13006pmulhrsw m1, m3
13007packuswb m1, m1
13008movh [r0 + 254 * 16], m1
13009
13010; mode 16 [row 12 - second half]
13011pmaddubsw m1, m0, [r5 + 15 * 16]
13012pmulhrsw m1, m3
13013packuswb m1, m1
13014movh [r0 + 236 * 16 + 8], m1
13015; mode 16 [row 12 - second half] end
13016
13017; mode 12 [row 6]
13018pslldq m2, m0, 2
13019pinsrb m2, [r3 + 0], 1
13020pinsrb m2, [r3 + 6], 0
13021pmaddubsw m1, m2, [r5 + 29 * 16]
13022pmulhrsw m1, m3
13023movu m0, [r4 + 7]
13024psrldq m4, m0, 1
13025punpcklbw m0, m4
13026pmaddubsw m4, m0, [r5 + 29 * 16]
13027pmulhrsw m4, m3
13028packuswb m1, m4
13029movu [r0 + 166 * 16], m1
13030
13031; mode 12 [row 7]
13032pmaddubsw m1, m2, [r5 + 24 * 16]
13033pmulhrsw m1, m3
13034pmaddubsw m4, m0, [r5 + 24 * 16]
13035pmulhrsw m4, m3
13036packuswb m1, m4
13037movu [r0 + 167 * 16], m1
13038
13039; mode 12 [row 8]
13040pmaddubsw m1, m2, [r5 + 19 * 16]
13041pmulhrsw m1, m3
13042pmaddubsw m4, m0, [r5 + 19 * 16]
13043pmulhrsw m4, m3
13044packuswb m1, m4
13045movu [r0 + 168 * 16], m1
13046
13047; mode 12 [row 9]
13048pmaddubsw m1, m2, [r5 + 14 * 16]
13049pmulhrsw m1, m3
13050pmaddubsw m4, m0, [r5 + 14 * 16]
13051pmulhrsw m4, m3
13052packuswb m1, m4
13053movu [r0 + 169 * 16], m1
13054
13055; mode 12 [row 10]
13056pmaddubsw m1, m2, [r5 + 9 * 16]
13057pmulhrsw m1, m3
13058pmaddubsw m4, m0, [r5 + 9 * 16]
13059pmulhrsw m4, m3
13060packuswb m1, m4
13061movu [r0 + 170 * 16], m1
13062
13063; mode 12 [row 11]
13064pmaddubsw m1, m2, [r5 + 4 * 16]
13065pmulhrsw m1, m3
13066pmaddubsw m4, m0, [r5 + 4 * 16]
13067pmulhrsw m4, m3
13068packuswb m1, m4
13069movu [r0 + 171 * 16], m1
13070
13071; mode 13 [row 3]
13072pinsrb m7, m2, [r3 + 4], 0
13073pmaddubsw m1, m7, [r5 + 28 * 16]
13074pmulhrsw m1, m3
13075pmaddubsw m4, m0, [r5 + 28 * 16]
13076pmulhrsw m4, m3
13077packuswb m1, m4
13078movu [r0 + 179 * 16], m1
13079
13080; mode 13 [row 4]
13081pmaddubsw m1, m7, [r5 + 19 * 16]
13082pmulhrsw m1, m3
13083pmaddubsw m4, m0, [r5 + 19 * 16]
13084pmulhrsw m4, m3
13085packuswb m1, m4
13086movu [r0 + 180 * 16], m1
13087
13088; mode 13 [row 5]
13089pmaddubsw m1, m7, [r5 + 10 * 16]
13090pmulhrsw m1, m3
13091pmaddubsw m4, m0, [r5 + 10 * 16]
13092pmulhrsw m4, m3
13093packuswb m1, m4
13094movu [r0 + 181 * 16], m1
13095
13096; mode 13 [row 6]
13097pmaddubsw m1, m7, [r5 + 1 * 16]
13098pmulhrsw m1, m3
13099pmaddubsw m4, m0, [r5 + 1 * 16]
13100pmulhrsw m4, m3
13101packuswb m1, m4
13102movu [r0 + 182 * 16], m1
13103
13104; mode 14 [row 2]
13105pinsrb m5, m7, [r3 + 2], 0
13106pmaddubsw m1, m5, [r5 + 25 * 16]
13107pmulhrsw m1, m3
13108pmaddubsw m4, m0, [r5 + 25 * 16]
13109pmulhrsw m4, m3
13110packuswb m1, m4
13111movu [r0 + 194 * 16], m1
13112
13113; mode 14 [row 3]
13114pmaddubsw m1, m5, [r5 + 12 * 16]
13115pmulhrsw m1, m3
13116pmaddubsw m4, m0, [r5 + 12 * 16]
13117pmulhrsw m4, m3
13118packuswb m1, m4
13119movu [r0 + 195 * 16], m1
13120
13121; mode 15 [row 1]
13122pmaddubsw m1, m5, [r5 + 30 * 16]
13123pmulhrsw m1, m3
13124pmaddubsw m4, m0, [r5 + 30 * 16]
13125pmulhrsw m4, m3
13126packuswb m1, m4
13127movu [r0 + 209 * 16], m1
13128
13129; mode 15 [row 2]
13130pmaddubsw m1, m5, [r5 + 13 * 16]
13131pmulhrsw m1, m3
13132pmaddubsw m4, m0, [r5 + 13 * 16]
13133pmulhrsw m4, m3
13134packuswb m1, m4
13135movu [r0 + 210 * 16], m1
13136
13137; mode 16 [row 1]
13138pmaddubsw m1, m5, [r5 + 22 * 16]
13139pmulhrsw m1, m3
13140pmaddubsw m4, m0, [r5 + 22 * 16]
13141pmulhrsw m4, m3
13142packuswb m1, m4
13143movu [r0 + 225 * 16], m1
13144
13145; mode 16 [row 2]
13146pmaddubsw m1, m5, [r5 + 1 * 16]
13147pmulhrsw m1, m3
13148pmaddubsw m4, m0, [r5 + 1 * 16]
13149pmulhrsw m4, m3
13150packuswb m1, m4
13151movu [r0 + 226 * 16], m1
13152
13153; mode 16 [row 13 - second half]
13154pmaddubsw m1, m5, [r5 + 26 * 16]
13155pmulhrsw m1, m3
13156packuswb m1, m1
13157movh [r0 + 237 * 16 + 8], m1
13158; mode 16 [row 13 - second half] end
13159
13160; mode 16 [row 14 - second half]
13161pmaddubsw m1, m5, [r5 + 5 * 16]
13162pmulhrsw m1, m3
13163packuswb m1, m1
13164movh [r0 + 238 * 16 + 8], m1
13165; mode 16 [row 14 - second half] end
13166
13167; mode 16 [row 3]
13168pslldq m6, m5, 2
13169pinsrb m6, [r3 + 2], 1
13170pinsrb m6, [r3 + 3], 0
13171pmaddubsw m1, m6, [r5 + 12 * 16]
13172pmulhrsw m1, m3
13173packuswb m1, m1
13174movh [r0 + 227 * 16], m1
13175
13176; mode 16 [row 15 - second half]
13177pmaddubsw m1, m6, [r5 + 16 * 16]
13178pmulhrsw m1, m3
13179packuswb m1, m1
13180movh [r0 + 239 * 16 + 8], m1
13181; mode 16 [row 15 - second half] end
13182
13183; mode 16 [row 4- first half]
13184pslldq m6, 2
13185pinsrb m6, [r3 + 3], 1
13186pinsrb m6, [r3 + 5], 0
13187pmaddubsw m1, m6, [r5 + 23 * 16]
13188pmulhrsw m1, m3
13189packuswb m1, m1
13190movh [r0 + 228 * 16], m1
13191
13192; mode 16 [row 5- first half]
13193pmaddubsw m1, m6, [r5 + 2 * 16]
13194pmulhrsw m1, m3
13195packuswb m1, m1
13196movh [r0 + 229 * 16], m1
13197
13198; mode 16 [row 6- first half]
13199pslldq m6, 2
13200pinsrb m6, [r3 + 5], 1
13201pinsrb m6, [r3 + 6], 0
13202pmaddubsw m1, m6, [r5 + 13 * 16]
13203pmulhrsw m1, m3
13204packuswb m1, m1
13205movh [r0 + 230 * 16], m1
13206
13207; mode 16 [row 7- first half]
13208pslldq m6, 2
13209pinsrb m6, [r3 + 6], 1
13210pinsrb m6, [r3 + 8], 0
13211pmaddubsw m1, m6, [r5 + 24 * 16]
13212pmulhrsw m1, m3
13213packuswb m1, m1
13214movh [r0 + 231 * 16], m1
13215
13216; mode 16 [row 8- first half]
13217pmaddubsw m1, m6, [r5 + 3 * 16]
13218pmulhrsw m1, m3
13219packuswb m1, m1
13220movh [r0 + 232 * 16], m1
13221; mode 16 [row 8 - first half] end
13222
13223; mode 16 [row 9- first half]
13224pslldq m6, 2
13225pinsrb m6, [r3 + 8], 1
13226pinsrb m6, [r3 + 9], 0
13227pmaddubsw m1, m6, [r5 + 14 * 16]
13228pmulhrsw m1, m3
13229packuswb m1, m1
13230movh [r0 + 233 * 16], m1
13231
13232; mode 16 [row 10 - first half]
13233pslldq m6, 2
13234pinsrb m6, [r3 + 9], 1
13235pinsrb m6, [r3 + 11], 0
13236pmaddubsw m1, m6, [r5 + 25 * 16]
13237pmulhrsw m1, m3
13238packuswb m1, m1
13239movh [r0 + 234 * 16], m1
13240
13241; mode 16 [row 11 - first half]
13242pmaddubsw m1, m6, [r5 + 4 * 16]
13243pmulhrsw m1, m3
13244packuswb m1, m1
13245movh [r0 + 235 * 16], m1
13246
13247; mode 16 [row 12 - first half]
13248pslldq m6, 2
13249pinsrb m6, [r3 + 11], 1
13250pinsrb m6, [r3 + 12], 0
13251pmaddubsw m1, m6, [r5 + 15 * 16]
13252pmulhrsw m1, m3
13253packuswb m1, m1
13254movh [r0 + 236 * 16], m1
13255
13256; mode 16 [row 13 - first half]
13257pslldq m6, 2
13258pinsrb m6, [r3 + 12], 1
13259pinsrb m6, [r3 + 14], 0
13260pmaddubsw m1, m6, [r5 + 26 * 16]
13261pmulhrsw m1, m3
13262packuswb m1, m1
13263movh [r0 + 237 * 16], m1
13264
13265; mode 16 [row 14 - first half]
13266pmaddubsw m1, m6, [r5 + 5 * 16]
13267pmulhrsw m1, m3
13268packuswb m1, m1
13269movh [r0 + 238 * 16], m1
13270
13271; mode 16 [row 15 - first half]
13272pslldq m6, 2
13273pinsrb m6, [r3 + 14], 1
13274pinsrb m6, [r3 + 15], 0
13275pmaddubsw m1, m6, [r5 + 16 * 16]
13276pmulhrsw m1, m3
13277packuswb m1, m1
13278movh [r0 + 239 * 16], m1
13279
13280; mode 14 [row 4]
13281pslldq m5, 2
13282pinsrb m5, [r3 + 2], 1
13283pinsrb m5, [r3 + 5], 0
13284movu m4, [r4 + 6]
13285psrldq m0, m4, 1
13286punpcklbw m4, m0
13287
13288; mode 16 [row 3 - second half]
13289pmaddubsw m1, m4, [r5 + 12 * 16]
13290pmulhrsw m1, m3
13291packuswb m1, m1
13292movh [r0 + 227 * 16 + 8], m1
13293
13294; mode 16 [row 3 - second half] end; mode 14 [row 4] continues
13295pmaddubsw m1, m5, [r5 + 31 * 16]
13296pmulhrsw m1, m3
13297pmaddubsw m0, m4, [r5 + 31 * 16]
13298pmulhrsw m0, m3
13299packuswb m1, m0
13300movu [r0 + 196 * 16], m1
13301
13302; mode 14 [row 5]
13303pmaddubsw m1, m5, [r5 + 18 * 16]
13304pmulhrsw m1, m3
13305pmaddubsw m0, m4, [r5 + 18 * 16]
13306pmulhrsw m0, m3
13307packuswb m1, m0
13308movu [r0 + 197 * 16], m1
13309
13310; mode 14 [row 6]
13311pmaddubsw m1, m5, [r5 + 5 * 16]
13312pmulhrsw m1, m3
13313pmaddubsw m0, m4, [r5 + 5 * 16]
13314pmulhrsw m0, m3
13315packuswb m1, m0
13316movu [r0 + 198 * 16], m1
13317
13318; mode 15 [row 3]
13319movu m6, m5
13320pinsrb m6, [r3 + 4], 0
13321pmaddubsw m1, m6, [r5 + 28 * 16]
13322pmulhrsw m1, m3
13323pmaddubsw m0, m4, [r5 + 28 * 16]
13324pmulhrsw m0, m3
13325packuswb m1, m0
13326movu [r0 + 211 * 16], m1
13327
13328; mode 15 [row 4]
13329pmaddubsw m1, m6, [r5 + 11 * 16]
13330pmulhrsw m1, m3
13331pmaddubsw m0, m4, [r5 + 11 * 16]
13332pmulhrsw m0, m3
13333packuswb m1, m0
13334movu [r0 + 212 * 16], m1
13335
13336; mode 15 [row 5 - first half]
13337pslldq m6, 2
13338pinsrb m6, [r3 + 4], 1
13339pinsrb m6, [r3 + 6], 0
13340pmaddubsw m1, m6, [r5 + 26 * 16]
13341pmulhrsw m1, m3
13342packuswb m1, m1
13343movh [r0 + 213 * 16], m1
13344
13345; mode 15 [row 6 - first half]
13346pmaddubsw m1, m6, [r5 + 9 * 16]
13347pmulhrsw m1, m3
13348packuswb m1, m1
13349movh [r0 + 214 * 16], m1
13350
13351; mode 15 [row 7 - first half]
13352pslldq m6, 2
13353pinsrb m6, [r3 + 6], 1
13354pinsrb m6, [r3 + 8], 0
13355pmaddubsw m1, m6, [r5 + 24 * 16]
13356pmulhrsw m1, m3
13357packuswb m1, m1
13358movh [r0 + 215 * 16], m1
13359
13360; mode 15 [row 8 - first half]
13361pmaddubsw m1, m6, [r5 + 7 * 16]
13362pmulhrsw m1, m3
13363packuswb m1, m1
13364movh [r0 + 216 * 16], m1
13365
13366; mode 15 [row 9 - first half]
13367pslldq m6, 2
13368pinsrb m6, [r3 + 8], 1
13369pinsrb m6, [r3 + 9], 0
13370pmaddubsw m1, m6, [r5 + 22 * 16]
13371pmulhrsw m1, m3
13372packuswb m1, m1
13373movh [r0 + 217 * 16], m1
13374
13375; mode 15 [row 10 - first half]
13376pmaddubsw m1, m6, [r5 + 5 * 16]
13377pmulhrsw m1, m3
13378packuswb m1, m1
13379movh [r0 + 218 * 16], m1
13380
13381; mode 15 [row 11 - first half]
13382pslldq m6, 2
13383pinsrb m6, [r3 + 9], 1
13384pinsrb m6, [r3 + 11], 0
13385pmaddubsw m1, m6, [r5 + 20 * 16]
13386pmulhrsw m1, m3
13387packuswb m1, m1
13388movh [r0 + 219 * 16], m1
13389
13390; mode 15 [row 12 - first half]
13391pmaddubsw m1, m6, [r5 + 3 * 16]
13392pmulhrsw m1, m3
13393packuswb m1, m1
13394movh [r0 + 220 * 16], m1
13395
13396; mode 15 [row 13 - first half]
13397pslldq m6, 2
13398pinsrb m6, [r3 + 11], 1
13399pinsrb m6, [r3 + 13], 0
13400pmaddubsw m1, m6, [r5 + 18 * 16]
13401pmulhrsw m1, m3
13402packuswb m1, m1
13403movh [r0 + 221 * 16], m1
13404
13405; mode 15 [row 14 - first half]
13406pmaddubsw m1, m6, [r5 + 1 * 16]
13407pmulhrsw m1, m3
13408packuswb m1, m1
13409movh [r0 + 222 * 16], m1
13410
13411; mode 15 [row 15 - first half]
13412pslldq m6, 2
13413pinsrb m6, [r3 + 13], 1
13414pinsrb m6, [r3 + 15], 0
13415pmaddubsw m1, m6, [r5 + 16 * 16]
13416pmulhrsw m1, m3
13417packuswb m1, m1
13418movh [r0 + 223 * 16], m1
13419
13420; mode 14 [row 7]
13421pslldq m5, 2
13422pinsrb m5, [r3 + 5], 1
13423pinsrb m5, [r3 + 7], 0
13424movu m0, [r4 + 5]
13425psrldq m6, m0, 1
13426punpcklbw m0, m6
13427
13428; mode 15 [row 5 - second half]
13429pmaddubsw m1, m0, [r5 + 26 * 16]
13430pmulhrsw m1, m3
13431packuswb m1, m1
13432movh [r0 + 213 * 16 + 8], m1
13433; mode 15 [row 5 - second half] end
13434
13435; mode 15 [row 6 - second half]
13436pmaddubsw m1, m0, [r5 + 9 * 16]
13437pmulhrsw m1, m3
13438packuswb m1, m1
13439movh [r0 + 214 * 16 + 8], m1
13440; mode 15 [row 6 - second half] end
13441
13442; mode 16 [row 4 - second half]
13443pmaddubsw m1, m0, [r5 + 23 * 16]
13444pmulhrsw m1, m3
13445packuswb m1, m1
13446movh [r0 + 228 * 16 + 8], m1
13447; mode 16 [row 4 - second half] end
13448
13449; mode 16 [row 5 - second half]
13450pmaddubsw m1, m0, [r5 + 2 * 16]
13451pmulhrsw m1, m3
13452packuswb m1, m1
13453movh [r0 + 229 * 16 + 8], m1
13454
13455; mode 16 [row 5 - second half] end; mode 14 [row 7] continues
13456pmaddubsw m1, m5, [r5 + 24 * 16]
13457pmulhrsw m1, m3
13458pmaddubsw m6, m0, [r5 + 24 * 16]
13459pmulhrsw m6, m3
13460packuswb m1, m6
13461movu [r0 + 199 * 16], m1
13462
13463; mode 14 [row 8]
13464pmaddubsw m1, m5, [r5 + 11 * 16]
13465pmulhrsw m1, m3
13466pmaddubsw m6, m0, [r5 + 11 * 16]
13467pmulhrsw m6, m3
13468packuswb m1, m6
13469movu [r0 + 200 * 16], m1
13470
13471; mode 14 [row 9]
13472pslldq m5, 2
13473pinsrb m5, [r3 + 7], 1
13474pinsrb m5, [r3 + 10], 0
13475movu m0, [r4 + 4]
13476psrldq m6, m0, 1
13477punpcklbw m0, m6
13478
13479; mode 15 [row 7 - second half]
13480pmaddubsw m1, m0, [r5 + 24 * 16]
13481pmulhrsw m1, m3
13482packuswb m1, m1
13483movh [r0 + 215 * 16 + 8], m1
13484; mode 15 [row 7 - second half] end
13485
13486; mode 15 [row 8 - second half]
13487pmaddubsw m1, m0, [r5 + 7 * 16]
13488pmulhrsw m1, m3
13489packuswb m1, m1
13490movh [r0 + 216 * 16 + 8], m1
13491; mode 15 [row 8 - second half] end
13492
13493; mode 16 [row 6 - second half]
13494pmaddubsw m1, m0, [r5 + 13 * 16]
13495pmulhrsw m1, m3
13496packuswb m1, m1
13497movh [r0 + 230 * 16 + 8], m1
13498; mode 16 [row 6 - second half] end
13499
13500; mode 14 [row 9] (continued)
13501pmaddubsw m1, m5, [r5 + 30 * 16]
13502pmulhrsw m1, m3
13503pmaddubsw m6, m0, [r5 + 30 * 16]
13504pmulhrsw m6, m3
13505packuswb m1, m6
13506movu [r0 + 201 * 16], m1
13507
13508; mode 14 [row 10]
13509pmaddubsw m1, m5, [r5 + 17 * 16]
13510pmulhrsw m1, m3
13511pmaddubsw m6, m0, [r5 + 17 * 16]
13512pmulhrsw m6, m3
13513packuswb m1, m6
13514movu [r0 + 202 * 16], m1
13515
13516; mode 14 [row 11]
13517pmaddubsw m1, m5, [r5 + 4 * 16]
13518pmulhrsw m1, m3
13519pmaddubsw m6, m0, [r5 + 4 * 16]
13520pmulhrsw m6, m3
13521packuswb m1, m6
13522movu [r0 + 203 * 16], m1
13523
13524; mode 14 [row 12]
13525pslldq m5, 2
13526pinsrb m5, [r3 + 10], 1
13527pinsrb m5, [r3 + 12], 0
13528movu m0, [r4 + 3]
13529psrldq m6, m0, 1
13530punpcklbw m0, m6
13531
13532; mode 15 [row 9 - second half]
13533pmaddubsw m1, m0, [r5 + 22 * 16]
13534pmulhrsw m1, m3
13535packuswb m1, m1
13536movh [r0 + 217 * 16 + 8], m1
13537; mode 15 [row 9 - second half] end
13538
13539; mode 15 [row 10 - second half]
13540pmaddubsw m1, m0, [r5 + 5 * 16]
13541pmulhrsw m1, m3
13542packuswb m1, m1
13543movh [r0 + 218 * 16 + 8], m1
13544; mode 15 [row 10 - second half] end
13545
13546; mode 16 [row 7 - second half]
13547pmaddubsw m1, m0, [r5 + 24 * 16]
13548pmulhrsw m1, m3
13549packuswb m1, m1
13550movh [r0 + 231 * 16 + 8], m1
13551; mode 16 [row 7 - second half] end
13552
13553; mode 16 [row 8 - second half]
13554pmaddubsw m1, m0, [r5 + 3 * 16]
13555pmulhrsw m1, m3
13556packuswb m1, m1
13557movh [r0 + 232 * 16 + 8], m1
13558; mode 16 [row 8 - second half] end
13559
13560pmaddubsw m1, m5, [r5 + 23 * 16]
13561pmulhrsw m1, m3
13562pmaddubsw m6, m0, [r5 + 23 * 16]
13563pmulhrsw m6, m3
13564packuswb m1, m6
13565movu [r0 + 204 * 16], m1
13566
13567; mode 14 [row 13]
13568pmaddubsw m1, m5, [r5 + 10 * 16]
13569pmulhrsw m1, m3
13570pmaddubsw m6, m0, [r5 + 10 * 16]
13571pmulhrsw m6, m3
13572packuswb m1, m6
13573movu [r0 + 205 * 16], m1
13574
13575; mode 14 [row 14]
13576pslldq m5, 2
13577pinsrb m5, [r3 + 12], 1
13578pinsrb m5, [r3 + 15], 0
13579movu m0, [r4 + 2]
13580psrldq m6, m0, 1
13581punpcklbw m0, m6
13582
13583; mode 15 [row 11 - second half]
13584pmaddubsw m1, m0, [r5 + 20 * 16]
13585pmulhrsw m1, m3
13586packuswb m1, m1
13587movh [r0 + 219 * 16 + 8], m1
13588; mode 15 [row 11 - second half] end
13589
13590; mode 15 [row 12 - second half]
13591pmaddubsw m1, m0, [r5 + 3 * 16]
13592pmulhrsw m1, m3
13593packuswb m1, m1
13594movh [r0 + 220 * 16 + 8], m1
13595; mode 15 [row 12 - second half] end
13596
13597; mode 16 [row 9 - second half]
13598pmaddubsw m1, m0, [r5 + 14 * 16]
13599pmulhrsw m1, m3
13600packuswb m1, m1
13601movh [r0 + 233 * 16 + 8], m1
13602
13603; mode 16 [row 9 - second half] end; mode 14 [row 14] continues
13604pmaddubsw m1, m5, [r5 + 29 * 16]
13605pmulhrsw m1, m3
13606pmaddubsw m6, m0, [r5 + 29 * 16]
13607pmulhrsw m6, m3
13608packuswb m1, m6
13609movu [r0 + 206 * 16], m1
13610
13611; mode 14 [row 15]
13612pmaddubsw m1, m5, [r5 + 16 * 16]
13613pmulhrsw m1, m3
13614pmaddubsw m6, m0, [r5 + 16 * 16]
13615pmulhrsw m6, m3
13616packuswb m1, m6
13617movu [r0 + 207 * 16], m1
13618
13619; mode 12 [row 12]
13620pslldq m0, m2, 2
13621pinsrb m0, [r3 + 6], 1
13622pinsrb m0, [r3 + 13], 0
13623pmaddubsw m1, m0, [r5 + 31 * 16]
13624pmulhrsw m1, m3
13625pmaddubsw m5, m4, [r5 + 31 * 16]
13626pmulhrsw m5, m3
13627packuswb m1, m5
13628movu [r0 + 172 * 16], m1
13629
13630; mode 12 [row 13]
13631pmaddubsw m1, m0, [r5 + 26 * 16]
13632pmulhrsw m1, m3
13633pmaddubsw m5, m4, [r5 + 26 * 16]
13634pmulhrsw m5, m3
13635packuswb m1, m5
13636movu [r0 + 173 * 16], m1
13637
13638; mode 12 [row 14]
13639pmaddubsw m1, m0, [r5 + 21 * 16]
13640pmulhrsw m1, m3
13641pmaddubsw m5, m4, [r5 + 21 * 16]
13642pmulhrsw m5, m3
13643packuswb m1, m5
13644movu [r0 + 174 * 16], m1
13645
13646; mode 12 [row 15]
13647pmaddubsw m1, m0, [r5 + 16 * 16]
13648pmulhrsw m1, m3
13649pmaddubsw m5, m4, [r5 + 16 * 16]
13650pmulhrsw m5, m3
13651packuswb m1, m5
13652movu [r0 + 175 * 16], m1
13653
13654; mode 13 [row 7]
13655pslldq m7, 2
13656pinsrb m7, [r3 + 4], 1
13657pinsrb m7, [r3 + 7], 0
13658pmaddubsw m1, m7, [r5 + 24 * 16]
13659pmulhrsw m1, m3
13660pmaddubsw m5, m4, [r5 + 24 * 16]
13661pmulhrsw m5, m3
13662packuswb m1, m5
13663movu [r0 + 183 * 16], m1
13664
13665; mode 13 [row 8]
13666pmaddubsw m1, m7, [r5 + 15 * 16]
13667pmulhrsw m1, m3
13668pmaddubsw m5, m4, [r5 + 15 * 16]
13669pmulhrsw m5, m3
13670packuswb m1, m5
13671movu [r0 + 184 * 16], m1
13672
13673; mode 13 [row 9]
13674pmaddubsw m1, m7, [r5 + 6 * 16]
13675pmulhrsw m1, m3
13676pmaddubsw m5, m4, [r5 + 6 * 16]
13677pmulhrsw m5, m3
13678packuswb m1, m5
13679movu [r0 + 185 * 16], m1
13680
13681; mode 13 [row 10]
13682pslldq m7, 2
13683pinsrb m7, [r3 + 7], 1
13684pinsrb m7, [r3 + 11], 0
13685pmaddubsw m1, m7, [r5 + 29 * 16]
13686pmulhrsw m1, m3
13687movu m4, [r4 + 5]
13688psrldq m5, m4, 1
13689punpcklbw m4, m5
13690pmaddubsw m5, m4, [r5 + 29 * 16]
13691pmulhrsw m5, m3
13692packuswb m1, m5
13693movu [r0 + 186 * 16], m1
13694
13695; mode 13 [row 11]
13696pmaddubsw m1, m7, [r5 + 20 * 16]
13697pmulhrsw m1, m3
13698pmaddubsw m5, m4, [r5 + 20 * 16]
13699pmulhrsw m5, m3
13700packuswb m1, m5
13701movu [r0 + 187 * 16], m1
13702
13703; mode 13 [row 12]
13704pmaddubsw m1, m7, [r5 + 11 * 16]
13705pmulhrsw m1, m3
13706pmaddubsw m5, m4, [r5 + 11 * 16]
13707pmulhrsw m5, m3
13708packuswb m1, m5
13709movu [r0 + 188 * 16], m1
13710
13711; mode 13 [row 13]
13712pmaddubsw m1, m7, [r5 + 2 * 16]
13713pmulhrsw m1, m3
13714pmaddubsw m5, m4, [r5 + 2 * 16]
13715pmulhrsw m5, m3
13716packuswb m1, m5
13717movu [r0 + 189 * 16], m1
13718
13719; mode 13 [row 14]
13720pslldq m7, 2
13721pinsrb m7, [r3 + 11], 1
13722pinsrb m7, [r3 + 14], 0
13723pmaddubsw m1, m7, [r5 + 25 * 16]
13724pmulhrsw m1, m3
13725movu m4, [r4 + 4]
13726psrldq m5, m4, 1
13727punpcklbw m4, m5
13728pmaddubsw m5, m4, [r5 + 25 * 16]
13729pmulhrsw m5, m3
13730packuswb m1, m5
13731movu [r0 + 190 * 16], m1
13732
13733; mode 13 [row 15]
13734pmaddubsw m1, m7, [r5 + 16 * 16]
13735pmulhrsw m1, m3
13736pmaddubsw m5, m4, [r5 + 16 * 16]
13737pmulhrsw m5, m3
13738packuswb m1, m5
13739movu [r0 + 191 * 16], m1
13740
13741; mode 17 [row 15]
13742movu m0, [r3]
13743pshufb m1, m0, [tab_S1]
13744movu [r0 + 255 * 16], m1
13745movu m2, [r4]
13746movd [r0 + 255 * 16 + 12], m2
13747
13748; mode 18 [row 0]
13749movu [r0 + 256 * 16], m0
13750
13751; mode 18 [rows 1 - 15]
13752pslldq m4, m0, 1
13753pinsrb m4, [r4 + 1], 0
13754movu [r0 + 257 * 16], m4
13755pslldq m4, 1
13756pinsrb m4, [r4 + 2], 0
13757movu [r0 + 258 * 16], m4
13758pslldq m4, 1
13759pinsrb m4, [r4 + 3], 0
13760movu [r0 + 259 * 16], m4
13761pslldq m4, 1
13762pinsrb m4, [r4 + 4], 0
13763movu [r0 + 260 * 16], m4
13764pslldq m4, 1
13765pinsrb m4, [r4 + 5], 0
13766movu [r0 + 261 * 16], m4
13767pslldq m4, 1
13768pinsrb m4, [r4 + 6], 0
13769movu [r0 + 262 * 16], m4
13770pslldq m4, 1
13771pinsrb m4, [r4 + 7], 0
13772movu [r0 + 263 * 16], m4
13773pslldq m4, 1
13774pinsrb m4, [r4 + 8], 0
13775movu [r0 + 264 * 16], m4
13776pslldq m4, 1
13777pinsrb m4, [r4 + 9], 0
13778movu [r0 + 265 * 16], m4
13779pslldq m4, 1
13780pinsrb m4, [r4 + 10], 0
13781movu [r0 + 266 * 16], m4
13782pslldq m4, 1
13783pinsrb m4, [r4 + 11], 0
13784movu [r0 + 267 * 16], m4
13785pslldq m4, 1
13786pinsrb m4, [r4 + 12], 0
13787movu [r0 + 268 * 16], m4
13788pslldq m4, 1
13789pinsrb m4, [r4 + 13], 0
13790movu [r0 + 269 * 16], m4
13791pslldq m4, 1
13792pinsrb m4, [r4 + 14], 0
13793movu [r0 + 270 * 16], m4
13794pslldq m4, 1
13795pinsrb m4, [r4 + 15], 0
13796movu [r0 + 271 * 16], m4
13797
13798; mode 19 [row 0]
13799psrldq m2, m0, 1
13800punpcklbw m0, m2
13801movu m5, [r3 + 8]
13802psrldq m6, m5, 1
13803punpcklbw m5, m6
13804pmaddubsw m4, m0, [r5 + 6 * 16]
13805pmulhrsw m4, m3
13806pmaddubsw m6, m5, [r5 + 6 * 16]
13807pmulhrsw m6, m3
13808packuswb m4, m6
13809movu [r0 + 272 * 16], m4
13810
13811; mode 20 [row 0]
13812pmaddubsw m4, m0, [r5 + 11 * 16]
13813pmulhrsw m4, m3
13814pmaddubsw m6, m5, [r5 + 11 * 16]
13815pmulhrsw m6, m3
13816packuswb m4, m6
13817movu [r0 + 288 * 16], m4
13818
13819; mode 21 [row 0]
13820pmaddubsw m4, m0, [r5 + 15 * 16]
13821pmulhrsw m4, m3
13822pmaddubsw m6, m5, [r5 + 15 * 16]
13823pmulhrsw m6, m3
13824packuswb m4, m6
13825movu [r0 + 304 * 16], m4
13826
13827; mode 22 [row 0]
13828pmaddubsw m4, m0, [r5 + 19 * 16]
13829pmulhrsw m4, m3
13830pmaddubsw m6, m5, [r5 + 19 * 16]
13831pmulhrsw m6, m3
13832packuswb m4, m6
13833movu [r0 + 320 * 16], m4
13834
13835; mode 22 [row 1]
13836pmaddubsw m4, m0, [r5 + 6 * 16]
13837pmulhrsw m4, m3
13838pmaddubsw m6, m5, [r5 + 6 * 16]
13839pmulhrsw m6, m3
13840packuswb m4, m6
13841movu [r0 + 321 * 16], m4
13842
13843; mode 23 [row 0]
13844pmaddubsw m4, m0, [r5 + 23 * 16]
13845pmulhrsw m4, m3
13846pmaddubsw m6, m5, [r5 + 23 * 16]
13847pmulhrsw m6, m3
13848packuswb m4, m6
13849movu [r0 + 336 * 16], m4
13850
13851; mode 23 [row 1]
13852pmaddubsw m4, m0, [r5 + 14 * 16]
13853pmulhrsw m4, m3
13854pmaddubsw m6, m5, [r5 + 14 * 16]
13855pmulhrsw m6, m3
13856packuswb m4, m6
13857movu [r0 + 337 * 16], m4
13858
13859; mode 23 [row 2]
13860pmaddubsw m4, m0, [r5 + 5 * 16]
13861pmulhrsw m4, m3
13862pmaddubsw m6, m5, [r5 + 5 * 16]
13863pmulhrsw m6, m3
13864packuswb m4, m6
13865movu [r0 + 338 * 16], m4
13866
13867; mode 24 [row 0]
13868pmaddubsw m4, m0, [r5 + 27 * 16]
13869pmulhrsw m4, m3
13870pmaddubsw m6, m5, [r5 + 27 * 16]
13871pmulhrsw m6, m3
13872packuswb m4, m6
13873movu [r0 + 352 * 16], m4
13874
13875; mode 24 [row 1]
13876pmaddubsw m4, m0, [r5 + 22 * 16]
13877pmulhrsw m4, m3
13878pmaddubsw m6, m5, [r5 + 22 * 16]
13879pmulhrsw m6, m3
13880packuswb m4, m6
13881movu [r0 + 353 * 16], m4
13882
13883; mode 24 [row 2]
13884pmaddubsw m4, m0, [r5 + 17 * 16]
13885pmulhrsw m4, m3
13886pmaddubsw m6, m5, [r5 + 17 * 16]
13887pmulhrsw m6, m3
13888packuswb m4, m6
13889movu [r0 + 354 * 16], m4
13890
13891; mode 24 [row 3]
13892pmaddubsw m4, m0, [r5 + 12 * 16]
13893pmulhrsw m4, m3
13894pmaddubsw m6, m5, [r5 + 12 * 16]
13895pmulhrsw m6, m3
13896packuswb m4, m6
13897movu [r0 + 355 * 16], m4
13898
13899; mode 24 [row 4]
13900pmaddubsw m4, m0, [r5 + 7 * 16]
13901pmulhrsw m4, m3
13902pmaddubsw m6, m5, [r5 + 7 * 16]
13903pmulhrsw m6, m3
13904packuswb m4, m6
13905movu [r0 + 356 * 16], m4
13906
13907; mode 24 [row 5]
13908pmaddubsw m4, m0, [r5 + 2 * 16]
13909pmulhrsw m4, m3
13910pmaddubsw m6, m5, [r5 + 2 * 16]
13911pmulhrsw m6, m3
13912packuswb m4, m6
13913movu [r0 + 357 * 16], m4
13914
13915; mode 24 [row 6 - first half]
13916pslldq m7, m0, 2
13917pinsrb m7, [r4 + 0], 1
13918pinsrb m7, [r4 + 6], 0
13919pmaddubsw m4, m7, [r5 + 29 * 16]
13920pmulhrsw m4, m3
13921packuswb m4, m4
13922movh [r0 + 358 * 16], m4
13923
13924; mode 24 [row 7 - first half]
13925pmaddubsw m4, m7, [r5 + 24 * 16]
13926pmulhrsw m4, m3
13927packuswb m4, m4
13928movh [r0 + 359 * 16], m4
13929
13930; mode 24 [row 8 - first half]
13931pmaddubsw m4, m7, [r5 + 19 * 16]
13932pmulhrsw m4, m3
13933packuswb m4, m4
13934movh [r0 + 360 * 16], m4
13935
13936; mode 24 [row 9 - first half]
13937pmaddubsw m4, m7, [r5 + 14 * 16]
13938pmulhrsw m4, m3
13939packuswb m4, m4
13940movh [r0 + 361 * 16], m4
13941
13942; mode 24 [row 10 - first half]
13943pmaddubsw m4, m7, [r5 + 9 * 16]
13944pmulhrsw m4, m3
13945packuswb m4, m4
13946movh [r0 + 362 * 16], m4
13947
13948; mode 24 [row 11 - first half]
13949pmaddubsw m4, m7, [r5 + 4 * 16]
13950pmulhrsw m4, m3
13951packuswb m4, m4
13952movh [r0 + 363 * 16], m4
13953
13954; mode 24 [row 12 - first half]
13955pslldq m7, 2
13956pinsrb m7, [r4 + 6], 1
13957pinsrb m7, [r4 + 13], 0
13958pmaddubsw m4, m7, [r5 + 31 * 16]
13959pmulhrsw m4, m3
13960packuswb m4, m4
13961movh [r0 + 364 * 16], m4
13962
13963; mode 24 [row 13 - first half]
13964pmaddubsw m4, m7, [r5 + 26 * 16]
13965pmulhrsw m4, m3
13966packuswb m4, m4
13967movh [r0 + 365 * 16], m4
13968
13969; mode 24 [row 14 - first half]
13970pmaddubsw m4, m7, [r5 + 21 * 16]
13971pmulhrsw m4, m3
13972packuswb m4, m4
13973movh [r0 + 366 * 16], m4
13974
13975; mode 24 [row 15 - first half]
13976pmaddubsw m4, m7, [r5 + 16 * 16]
13977pmulhrsw m4, m3
13978packuswb m4, m4
13979movh [r0 + 367 * 16], m4
13980
13981; mode 23 [row 3 - first half]
13982pslldq m7, m0, 2
13983pinsrb m7, [r4 + 0], 1
13984pinsrb m7, [r4 + 4], 0
13985pmaddubsw m4, m7, [r5 + 28 * 16]
13986pmulhrsw m4, m3
13987packuswb m4, m4
13988movh [r0 + 339 * 16], m4
13989
13990; mode 23 [row 4 - first half]
13991pmaddubsw m4, m7, [r5 + 19 * 16]
13992pmulhrsw m4, m3
13993packuswb m4, m4
13994movh [r0 + 340 * 16], m4
13995
13996; mode 23 [row 5 - first half]
13997pmaddubsw m4, m7, [r5 + 10 * 16]
13998pmulhrsw m4, m3
13999packuswb m4, m4
14000movh [r0 + 341 * 16], m4
14001
14002; mode 23 [row 6 - first half]
14003pmaddubsw m4, m7, [r5 + 1 * 16]
14004pmulhrsw m4, m3
14005packuswb m4, m4
14006movh [r0 + 342 * 16], m4
14007
14008; mode 23 [row 7 - first half]
14009pslldq m7, 2
14010pinsrb m7, [r4 + 4], 1
14011pinsrb m7, [r4 + 7], 0
14012pmaddubsw m4, m7, [r5 + 24 * 16]
14013pmulhrsw m4, m3
14014packuswb m4, m4
14015movh [r0 + 343 * 16], m4
14016
14017; mode 23 [row 8 - first half]
14018pmaddubsw m4, m7, [r5 + 15 * 16]
14019pmulhrsw m4, m3
14020packuswb m4, m4
14021movh [r0 + 344 * 16], m4
14022
14023; mode 23 [row 9 - first half]
14024pmaddubsw m4, m7, [r5 + 6 * 16]
14025pmulhrsw m4, m3
14026packuswb m4, m4
14027movh [r0 + 345 * 16], m4
14028
14029; mode 23 [row 10 - first half]
14030pslldq m7, 2
14031pinsrb m7, [r4 + 7], 1
14032pinsrb m7, [r4 + 11], 0
14033pmaddubsw m4, m7, [r5 + 29 * 16]
14034pmulhrsw m4, m3
14035packuswb m4, m4
14036movh [r0 + 346 * 16], m4
14037
14038; mode 23 [row 11 - first half]
14039pmaddubsw m4, m7, [r5 + 20 * 16]
14040pmulhrsw m4, m3
14041packuswb m4, m4
14042movh [r0 + 347 * 16], m4
14043
14044; mode 23 [row 12 - first half]
14045pmaddubsw m4, m7, [r5 + 11 * 16]
14046pmulhrsw m4, m3
14047packuswb m4, m4
14048movh [r0 + 348 * 16], m4
14049
14050; mode 23 [row 13 - first half]
14051pmaddubsw m4, m7, [r5 + 2 * 16]
14052pmulhrsw m4, m3
14053packuswb m4, m4
14054movh [r0 + 349 * 16], m4
14055
14056; mode 23 [row 14 - first half]
14057pslldq m7, 2
14058pinsrb m7, [r4 + 11], 1
14059pinsrb m7, [r4 + 14], 0
14060pmaddubsw m4, m7, [r5 + 25 * 16]
14061pmulhrsw m4, m3
14062packuswb m4, m4
14063movh [r0 + 350 * 16], m4
14064
14065; mode 23 [row 15 - first half]
14066pmaddubsw m4, m7, [r5 + 16 * 16]
14067pmulhrsw m4, m3
14068packuswb m4, m4
14069movh [r0 + 351 * 16], m4
14070
14071; mode 21 [row 15 - second half]
14072pmaddubsw m4, m0, [r5 + 16 * 16]
14073pmulhrsw m4, m3
14074packuswb m4, m4
14075movh [r0 + 319 * 16 + 8], m4
14076; mode 21 [row 15 - second half] end
14077
14078; mode 20 [row 1 - first half]
14079pslldq m7, m0, 2
14080pinsrb m7, [r4 + 0], 1
14081pinsrb m7, [r4 + 2], 0
14082pmaddubsw m4, m7, [r5 + 22 * 16]
14083pmulhrsw m4, m3
14084packuswb m4, m4
14085movh [r0 + 289 * 16], m4
14086
14087; mode 20 [row 2 - first half]
14088pmaddubsw m4, m7, [r5 + 1 * 16]
14089pmulhrsw m4, m3
14090packuswb m4, m4
14091movh [r0 + 290 * 16], m4
14092
14093; mode 21 [row 1 - first half]
14094pmaddubsw m4, m7, [r5 + 30 * 16]
14095pmulhrsw m4, m3
14096packuswb m4, m4
14097movh [r0 + 305 * 16], m4
14098
14099; mode 21 [row 2 - first half]
14100pmaddubsw m4, m7, [r5 + 13 * 16]
14101pmulhrsw m4, m3
14102packuswb m4, m4
14103movh [r0 + 306 * 16], m4
14104
14105; mode 22 [row 2 - first half]
14106pmaddubsw m4, m7, [r5 + 25 * 16]
14107pmulhrsw m4, m3
14108packuswb m4, m4
14109movh [r0 + 322 * 16], m4
14110
14111; mode 22 [row 3 - first half]
14112pmaddubsw m4, m7, [r5 + 12 * 16]
14113pmulhrsw m4, m3
14114packuswb m4, m4
14115movh [r0 + 323 * 16], m4
14116
14117; mode 22 [row 4 - first half]
14118pslldq m1, m7, 2
14119pinsrb m1, [r4 + 2], 1
14120pinsrb m1, [r4 + 5], 0
14121pmaddubsw m4, m1, [r5 + 31 * 16]
14122pmulhrsw m4, m3
14123packuswb m4, m4
14124movh [r0 + 324 * 16], m4
14125
14126; mode 22 [row 5 - first half]
14127pmaddubsw m4, m1, [r5 + 18 * 16]
14128pmulhrsw m4, m3
14129packuswb m4, m4
14130movh [r0 + 325 * 16], m4
14131
14132; mode 22 [row 6 - first half]
14133pmaddubsw m4, m1, [r5 + 5 * 16]
14134pmulhrsw m4, m3
14135packuswb m4, m4
14136movh [r0 + 326 * 16], m4
14137
14138; mode 22 [row 7 - first half]
14139pslldq m1, 2
14140pinsrb m1, [r4 + 5], 1
14141pinsrb m1, [r4 + 7], 0
14142pmaddubsw m4, m1, [r5 + 24 * 16]
14143pmulhrsw m4, m3
14144packuswb m4, m4
14145movh [r0 + 327 * 16], m4
14146
14147; mode 22 [row 8 - first half]
14148pmaddubsw m4, m1, [r5 + 11 * 16]
14149pmulhrsw m4, m3
14150packuswb m4, m4
14151movh [r0 + 328 * 16], m4
14152
14153; mode 22 [row 9 - first half]
14154pslldq m1, 2
14155pinsrb m1, [r4 + 7], 1
14156pinsrb m1, [r4 + 10], 0
14157pmaddubsw m4, m1, [r5 + 30 * 16]
14158pmulhrsw m4, m3
14159packuswb m4, m4
14160movh [r0 + 329 * 16], m4
14161
14162; mode 22 [row 10 - first half]
14163pmaddubsw m4, m1, [r5 + 17 * 16]
14164pmulhrsw m4, m3
14165packuswb m4, m4
14166movh [r0 + 330 * 16], m4
14167
14168; mode 22 [row 11 - first half]
14169pmaddubsw m4, m1, [r5 + 4 * 16]
14170pmulhrsw m4, m3
14171packuswb m4, m4
14172movh [r0 + 331 * 16], m4
14173
14174; mode 22 [row 12 - first half]
14175pslldq m1, 2
14176pinsrb m1, [r4 + 10], 1
14177pinsrb m1, [r4 + 12], 0
14178pmaddubsw m4, m1, [r5 + 23 * 16]
14179pmulhrsw m4, m3
14180packuswb m4, m4
14181movh [r0 + 332 * 16], m4
14182
14183; mode 22 [row 13 - first half]
14184pmaddubsw m4, m1, [r5 + 10 * 16]
14185pmulhrsw m4, m3
14186packuswb m4, m4
14187movh [r0 + 333 * 16], m4
14188
14189; mode 22 [row 14 - first half]
14190pslldq m1, 2
14191pinsrb m1, [r4 + 12], 1
14192pinsrb m1, [r4 + 15], 0
14193pmaddubsw m4, m1, [r5 + 29 * 16]
14194pmulhrsw m4, m3
14195packuswb m4, m4
14196movh [r0 + 334 * 16], m4
14197
14198; mode 22 [row 15 - first half]
14199pmaddubsw m4, m1, [r5 + 16 * 16]
14200pmulhrsw m4, m3
14201packuswb m4, m4
14202movh [r0 + 335 * 16], m4
14203
14204; mode 21 [row 3 - first half]
14205pslldq m6, m7, 2
14206pinsrb m6, [r4 + 2], 1
14207pinsrb m6, [r4 + 4], 0
14208pmaddubsw m4, m6, [r5 + 28 * 16]
14209pmulhrsw m4, m3
14210packuswb m4, m4
14211movh [r0 + 307 * 16], m4
14212
14213; mode 21 [row 4 - first half]
14214pmaddubsw m4, m6, [r5 + 11 * 16]
14215pmulhrsw m4, m3
14216packuswb m4, m4
14217movh [r0 + 308 * 16], m4
14218
14219; mode 21 [row 5 - first half]
14220pslldq m6, 2
14221pinsrb m6, [r4 + 4], 1
14222pinsrb m6, [r4 + 6], 0
14223pmaddubsw m4, m6, [r5 + 26 * 16]
14224pmulhrsw m4, m3
14225packuswb m4, m4
14226movh [r0 + 309 * 16], m4
14227
14228; mode 21 [row 6 - first half]
14229pmaddubsw m4, m6, [r5 + 9 * 16]
14230pmulhrsw m4, m3
14231packuswb m4, m4
14232movh [r0 + 310 * 16], m4
14233
14234; mode 21 [row 7 - first half]
14235pslldq m6, 2
14236pinsrb m6, [r4 + 6], 1
14237pinsrb m6, [r4 + 8], 0
14238pmaddubsw m4, m6, [r5 + 24 * 16]
14239pmulhrsw m4, m3
14240packuswb m4, m4
14241movh [r0 + 311 * 16], m4
14242
14243; mode 21 [row 8 - first half]
14244pmaddubsw m4, m6, [r5 + 7 * 16]
14245pmulhrsw m4, m3
14246packuswb m4, m4
14247movh [r0 + 312 * 16], m4
14248
14249; mode 21 [row 9 - first half]
14250pslldq m6, 2
14251pinsrb m6, [r4 + 8], 1
14252pinsrb m6, [r4 + 9], 0
14253pmaddubsw m4, m6, [r5 + 22 * 16]
14254pmulhrsw m4, m3
14255packuswb m4, m4
14256movh [r0 + 313 * 16], m4
14257
14258; mode 21 [row 10 - first half]
14259pmaddubsw m4, m6, [r5 + 5 * 16]
14260pmulhrsw m4, m3
14261packuswb m4, m4
14262movh [r0 + 314 * 16], m4
14263
14264; mode 21 [row 11 - first half]
14265pslldq m6, 2
14266pinsrb m6, [r4 + 9], 1
14267pinsrb m6, [r4 + 11], 0
14268pmaddubsw m4, m6, [r5 + 20 * 16]
14269pmulhrsw m4, m3
14270packuswb m4, m4
14271movh [r0 + 315 * 16], m4
14272
14273; mode 21 [row 12 - first half]
14274pmaddubsw m4, m6, [r5 + 3 * 16]
14275pmulhrsw m4, m3
14276packuswb m4, m4
14277movh [r0 + 316 * 16], m4
14278
14279; mode 21 [row 13 - first half]
14280pslldq m6, 2
14281pinsrb m6, [r4 + 11], 1
14282pinsrb m6, [r4 + 13], 0
14283pmaddubsw m4, m6, [r5 + 18 * 16]
14284pmulhrsw m4, m3
14285packuswb m4, m4
14286movh [r0 + 317 * 16], m4
14287
14288; mode 21 [row 14 - first half]
14289pmaddubsw m4, m6, [r5 + 1 * 16]
14290pmulhrsw m4, m3
14291packuswb m4, m4
14292movh [r0 + 318 * 16], m4
14293
14294; mode 21 [row 15 - first half]
14295pslldq m6, 2
14296pinsrb m6, [r4 + 13], 1
14297pinsrb m6, [r4 + 15], 0
14298pmaddubsw m4, m6, [r5 + 16 * 16]
14299pmulhrsw m4, m3
14300packuswb m4, m4
14301movh [r0 + 319 * 16], m4
14302
14303; mode 20 [row 13 - second half]
14304pmaddubsw m4, m7, [r5 + 26 * 16]
14305pmulhrsw m4, m3
14306packuswb m4, m4
14307movh [r0 + 301 * 16 + 8], m4
; mode 20 [row 13 - second half] end
14309
14310; mode 20 [row 14 - second half]
14311pmaddubsw m4, m7, [r5 + 5 * 16]
14312pmulhrsw m4, m3
14313packuswb m4, m4
14314movh [r0 + 302 * 16 + 8], m4
; mode 20 [row 14 - second half] end
14316
14317; mode 20 [row 3 - first half]
14318pslldq m7, 2
14319pinsrb m7, [r4 + 2], 1
14320pinsrb m7, [r4 + 3], 0
14321pmaddubsw m4, m7, [r5 + 12 * 16]
14322pmulhrsw m4, m3
14323packuswb m4, m4
14324movh [r0 + 291 * 16], m4
14325
14326; mode 20 [row 15 - second half]
14327pmaddubsw m4, m7, [r5 + 16 * 16]
14328pmulhrsw m4, m3
14329packuswb m4, m4
14330movh [r0 + 303 * 16 + 8], m4
; mode 20 [row 15 - second half] end
14332
14333; mode 20 [row 4 - first half]
14334pslldq m7, 2
14335pinsrb m7, [r4 + 3], 1
14336pinsrb m7, [r4 + 5], 0
14337pmaddubsw m4, m7, [r5 + 23 * 16]
14338pmulhrsw m4, m3
14339packuswb m4, m4
14340movh [r0 + 292 * 16], m4
14341
14342; mode 20 [row 5 - first half]
14343pmaddubsw m4, m7, [r5 + 2 * 16]
14344pmulhrsw m4, m3
14345packuswb m4, m4
14346movh [r0 + 293 * 16], m4
14347
14348; mode 20 [row 6 - first half]
14349pslldq m7, 2
14350pinsrb m7, [r4 + 5], 1
14351pinsrb m7, [r4 + 6], 0
14352pmaddubsw m4, m7, [r5 + 13 * 16]
14353pmulhrsw m4, m3
14354packuswb m4, m4
14355movh [r0 + 294 * 16], m4
14356
14357; mode 20 [row 7 - first half]
14358pslldq m7, 2
14359pinsrb m7, [r4 + 6], 1
14360pinsrb m7, [r4 + 8], 0
14361pmaddubsw m4, m7, [r5 + 24 * 16]
14362pmulhrsw m4, m3
14363packuswb m4, m4
14364movh [r0 + 295 * 16], m4
14365
14366; mode 20 [row 8 - first half]
14367pmaddubsw m4, m7, [r5 + 3 * 16]
14368pmulhrsw m4, m3
14369packuswb m4, m4
14370movh [r0 + 296 * 16], m4
14371
14372; mode 20 [row 9 - first half]
14373pslldq m7, 2
14374pinsrb m7, [r4 + 8], 1
14375pinsrb m7, [r4 + 9], 0
14376pmaddubsw m4, m7, [r5 + 14 * 16]
14377pmulhrsw m4, m3
14378packuswb m4, m4
14379movh [r0 + 297 * 16], m4
14380
14381; mode 20 [row 10 - first half]
14382pslldq m7, 2
14383pinsrb m7, [r4 + 9], 1
14384pinsrb m7, [r4 + 11], 0
14385pmaddubsw m4, m7, [r5 + 25 * 16]
14386pmulhrsw m4, m3
14387packuswb m4, m4
14388movh [r0 + 298 * 16], m4
14389
14390; mode 20 [row 11 - first half]
14391pmaddubsw m4, m7, [r5 + 4 * 16]
14392pmulhrsw m4, m3
14393packuswb m4, m4
14394movh [r0 + 299 * 16], m4
14395
; mode 20 [row 12 - first half]
; Advance m7 by one byte pair and insert the next two bytes from [r4 + 11]
; and [r4 + 12] into lanes 1 and 0, then weight with table entry 15.
; The weights are consumed directly from memory by pmaddubsw, so nothing is
; preloaded into a register here.
    pslldq      m7, 2
    pinsrb      m7, [r4 + 11], 1
    pinsrb      m7, [r4 + 12], 0
    pmaddubsw   m4, m7, [r5 + 15 * 16]
    pmulhrsw    m4, m3
    packuswb    m4, m4
    movh        [r0 + 300 * 16], m4         ; low 8 bytes of row 12
14405
14406; mode 20 [row 13 - first half]
14407pslldq m7, 2
14408pinsrb m7, [r4 + 12], 1
14409pinsrb m7, [r4 + 14], 0
14410pmaddubsw m4, m7, [r5 + 26 * 16]
14411pmulhrsw m4, m3
14412packuswb m4, m4
14413movh [r0 + 301 * 16], m4
14414
14415; mode 20 [row 14 - first half]
14416pmaddubsw m4, m7, [r5 + 5 * 16]
14417pmulhrsw m4, m3
14418packuswb m4, m4
14419movh [r0 + 302 * 16], m4
14420
14421; mode 20 [row 15 - first half]
14422pslldq m7, 2
14423pinsrb m7, [r4 + 14], 1
14424pinsrb m7, [r4 + 15], 0
14425pmaddubsw m4, m7, [r5 + 16 * 16]
14426pmulhrsw m4, m3
14427packuswb m4, m4
14428movh [r0 + 303 * 16], m4
14429
14430; mode 19 [row 1]
14431pslldq m0, 2
14432pinsrb m0, [r4 + 0], 1
14433pinsrb m0, [r4 + 1], 0
14434pslldq m5, 2
14435pinsrb m5, [r3 + 8], 1
14436pinsrb m5, [r3 + 7], 0
14437
14438; mode 20 [row 1 - second half]
14439pmaddubsw m4, m5, [r5 + 22 * 16]
14440pmulhrsw m4, m3
14441packuswb m4, m4
14442movh [r0 + 289 * 16 + 8], m4
14443; mode 20 [row 1 - second half] end
14444
14445; mode 20 [row 2 - second half]
14446pmaddubsw m4, m5, [r5 + 1 * 16]
14447pmulhrsw m4, m3
14448packuswb m4, m4
14449movh [r0 + 290 * 16 + 8], m4
14450; mode 20 [row 2 - second half] end
14451
; mode 21 [row 1 - second half]
; Destination index 305 = 304 (mode 21 row 0) + 1; the "+ 8" selects the
; upper 8 bytes of the 16-byte row.
    pmaddubsw   m4, m5, [r5 + 30 * 16]
    pmulhrsw    m4, m3
    packuswb    m4, m4
    movh        [r0 + 305 * 16 + 8], m4
; mode 21 [row 1 - second half] end

; mode 21 [row 2 - second half]
    pmaddubsw   m4, m5, [r5 + 13 * 16]
    pmulhrsw    m4, m3
    packuswb    m4, m4
    movh        [r0 + 306 * 16 + 8], m4
; mode 21 [row 2 - second half] end

; NOTE: the second halves of mode 21 rows 3 and 4 (307 * 16 + 8 and
; 308 * 16 + 8) are written after the next shift of m5, so no store to
; those rows is issued from the current m5 contents.
14472
14473; mode 22 [row 2 - second half]
14474pmaddubsw m4, m5, [r5 + 25 * 16]
14475pmulhrsw m4, m3
14476packuswb m4, m4
14477movh [r0 + 322 * 16 + 8], m4
14478; mode 22 [row 2 - second half] end
14479
14480; mode 22 [row 3 - second half]
14481pmaddubsw m4, m5, [r5 + 12 * 16]
14482pmulhrsw m4, m3
14483packuswb m4, m4
14484movh [r0 + 323 * 16 + 8], m4
14485; mode 22 [row 3 - second half] end
14486
14487; mode 23 [row 3 - second half]
14488pmaddubsw m4, m5, [r5 + 28 * 16]
14489pmulhrsw m4, m3
14490packuswb m4, m4
14491movh [r0 + 339 * 16 + 8], m4
14492; mode 23 [row 3 - second half] end
14493
14494; mode 23 [row 4 - second half]
14495pmaddubsw m4, m5, [r5 + 19 * 16]
14496pmulhrsw m4, m3
14497packuswb m4, m4
14498movh [r0 + 340 * 16 + 8], m4
14499; mode 23 [row 4 - second half] end
14500
14501; mode 23 [row 5 - second half]
14502pmaddubsw m4, m5, [r5 + 10 * 16]
14503pmulhrsw m4, m3
14504packuswb m4, m4
14505movh [r0 + 341 * 16 + 8], m4
14506; mode 23 [row 5 - second half] end
14507
14508; mode 23 [row 6 - second half]
14509pmaddubsw m4, m5, [r5 + 1 * 16]
14510pmulhrsw m4, m3
14511packuswb m4, m4
14512movh [r0 + 342 * 16 + 8], m4
14513; mode 23 [row 6 - second half] end
14514
14515; mode 24 [row 6 - second half]
14516pmaddubsw m4, m5, [r5 + 29 * 16]
14517pmulhrsw m4, m3
14518packuswb m4, m4
14519movh [r0 + 358 * 16 + 8], m4
14520; mode 24 [row 6 - second half] end
14521
14522; mode 24 [row 7 - second half]
14523pmaddubsw m4, m5, [r5 + 24 * 16]
14524pmulhrsw m4, m3
14525packuswb m4, m4
14526movh [r0 + 359 * 16 + 8], m4
14527; mode 24 [row 7 - second half] end
14528
14529; mode 24 [row 8 - second half]
14530pmaddubsw m4, m5, [r5 + 19 * 16]
14531pmulhrsw m4, m3
14532packuswb m4, m4
14533movh [r0 + 360 * 16 + 8], m4
14534; mode 24 [row 8 - second half] end
14535
14536; mode 24 [row 9 - second half]
14537pmaddubsw m4, m5, [r5 + 14 * 16]
14538pmulhrsw m4, m3
14539packuswb m4, m4
14540movh [r0 + 361 * 16 + 8], m4
14541; mode 24 [row 9 - second half] end
14542
14543; mode 24 [row 10 - second half]
14544pmaddubsw m4, m5, [r5 + 9 * 16]
14545pmulhrsw m4, m3
14546packuswb m4, m4
14547movh [r0 + 362 * 16 + 8], m4
14548; mode 24 [row 10 - second half] end
14549
14550; mode 24 [row 11 - second half]
14551pmaddubsw m4, m5, [r5 + 4 * 16]
14552pmulhrsw m4, m3
14553packuswb m4, m4
14554movh [r0 + 363 * 16 + 8], m4
14555; mode 24 [row 11 - second half] end
14556
14557pmaddubsw m4, m0, [r5 + 12 * 16]
14558pmulhrsw m4, m3
14559pmaddubsw m6, m5, [r5 + 12 * 16]
14560pmulhrsw m6, m3
14561packuswb m4, m6
14562movu [r0 + 273 * 16], m4
14563
14564; mode 19 [row 2]
14565pslldq m0, 2
14566pinsrb m0, [r4 + 1], 1
14567pinsrb m0, [r4 + 2], 0
14568pslldq m5, 2
14569pinsrb m5, [r3 + 7], 1
14570pinsrb m5, [r3 + 6], 0
14571
14572; mode 20 [row 3 - second half]
14573pmaddubsw m4, m5, [r5 + 12 * 16]
14574pmulhrsw m4, m3
14575packuswb m4, m4
14576movh [r0 + 291 * 16 + 8], m4
14577; mode 20 [row 3 - second half] end
14578
14579; mode 21 [row 3 - second half]
14580pmaddubsw m4, m5, [r5 + 28 * 16]
14581pmulhrsw m4, m3
14582packuswb m4, m4
14583movh [r0 + 307 * 16 + 8], m4
14584; mode 21 [row 3 - second half] end
14585
14586; mode 21 [row 4 - second half]
14587pmaddubsw m4, m5, [r5 + 11 * 16]
14588pmulhrsw m4, m3
14589packuswb m4, m4
14590movh [r0 + 308 * 16 + 8], m4
14591; mode 21 [row 4 - second half] end
14592
14593; mode 22 [row 4 - second half]
14594pmaddubsw m4, m5, [r5 + 31 * 16]
14595pmulhrsw m4, m3
14596packuswb m4, m4
14597movh [r0 + 324 * 16 + 8], m4
14598; mode 22 [row 4 - second half] end
14599
14600; mode 22 [row 5 - second half]
14601pmaddubsw m4, m5, [r5 + 18 * 16]
14602pmulhrsw m4, m3
14603packuswb m4, m4
14604movh [r0 + 325 * 16 + 8], m4
14605; mode 22 [row 5 - second half] end
14606
14607; mode 22 [row 6 - second half]
14608pmaddubsw m4, m5, [r5 + 5 * 16]
14609pmulhrsw m4, m3
14610packuswb m4, m4
14611movh [r0 + 326 * 16 + 8], m4
14612; mode 22 [row 6 - second half] end
14613
14614; mode 23 [row 7 - second half]
14615pmaddubsw m4, m5, [r5 + 24 * 16]
14616pmulhrsw m4, m3
14617packuswb m4, m4
14618movh [r0 + 343 * 16 + 8], m4
14619; mode 23 [row 7 - second half] end
14620
14621; mode 23 [row 8 - second half]
14622pmaddubsw m4, m5, [r5 + 15 * 16]
14623pmulhrsw m4, m3
14624packuswb m4, m4
14625movh [r0 + 344 * 16 + 8], m4
14626; mode 23 [row 8 - second half] end
14627
14628; mode 23 [row 9 - second half]
14629pmaddubsw m4, m5, [r5 + 6 * 16]
14630pmulhrsw m4, m3
14631packuswb m4, m4
14632movh [r0 + 345 * 16 + 8], m4
14633; mode 23 [row 9 - second half] end
14634
14635; mode 24 [row 12 - second half]
14636pmaddubsw m4, m5, [r5 + 31 * 16]
14637pmulhrsw m4, m3
14638packuswb m4, m4
14639movh [r0 + 364 * 16 + 8], m4
14640; mode 24 [row 12 - second half] end
14641
14642; mode 24 [row 13 - second half]
14643pmaddubsw m4, m5, [r5 + 26 * 16]
14644pmulhrsw m4, m3
14645packuswb m4, m4
14646movh [r0 + 365 * 16 + 8], m4
14647; mode 24 [row 13 - second half] end
14648
14649; mode 24 [row 14 - second half]
14650pmaddubsw m4, m5, [r5 + 21 * 16]
14651pmulhrsw m4, m3
14652packuswb m4, m4
14653movh [r0 + 366 * 16 + 8], m4
14654; mode 24 [row 14 - second half] end
14655
14656; mode 24 [row 15 - second half]
14657pmaddubsw m4, m5, [r5 + 16 * 16]
14658pmulhrsw m4, m3
14659packuswb m4, m4
14660movh [r0 + 367 * 16 + 8], m4
14661; mode 24 [row 15 - second half] end
14662
14663pmaddubsw m4, m0, [r5 + 18 * 16]
14664pmulhrsw m4, m3
14665pmaddubsw m6, m5, [r5 + 18 * 16]
14666pmulhrsw m6, m3
14667packuswb m4, m6
14668movu [r0 + 274 * 16], m4
14669
14670; mode 19 [row 3]
14671pslldq m0, 2
14672pinsrb m0, [r4 + 2], 1
14673pinsrb m0, [r4 + 4], 0
14674pslldq m5, 2
14675pinsrb m5, [r3 + 6], 1
14676pinsrb m5, [r3 + 5], 0
14677
14678; mode 20 [row 4 - second half]
14679pmaddubsw m4, m5, [r5 + 23 * 16]
14680pmulhrsw m4, m3
14681packuswb m4, m4
14682movh [r0 + 292 * 16 + 8], m4
14683; mode 20 [row 4 - second half] end
14684
14685; mode 20 [row 5 - second half]
14686pmaddubsw m4, m5, [r5 + 2 * 16]
14687pmulhrsw m4, m3
14688packuswb m4, m4
14689movh [r0 + 293 * 16 + 8], m4
14690; mode 20 [row 5 - second half] end
14691
14692; mode 21 [row 5 - second half]
14693pmaddubsw m4, m5, [r5 + 26 * 16]
14694pmulhrsw m4, m3
14695packuswb m4, m4
14696movh [r0 + 309 * 16 + 8], m4
14697; mode 21 [row 5 - second half] end
14698
14699; mode 21 [row 6 - second half]
14700pmaddubsw m4, m5, [r5 + 9 * 16]
14701pmulhrsw m4, m3
14702packuswb m4, m4
14703movh [r0 + 310 * 16 + 8], m4
14704; mode 21 [row 6 - second half] end
14705
14706; mode 22 [row 7 - second half]
14707pmaddubsw m4, m5, [r5 + 24 * 16]
14708pmulhrsw m4, m3
14709packuswb m4, m4
14710movh [r0 + 327 * 16 + 8], m4
14711; mode 22 [row 7 - second half] end
14712
14713; mode 22 [row 8 - second half]
14714pmaddubsw m4, m5, [r5 + 11 * 16]
14715pmulhrsw m4, m3
14716packuswb m4, m4
14717movh [r0 + 328 * 16 + 8], m4
; mode 22 [row 8 - second half] end
14719
14720; mode 23 [row 10 - second half]
14721pmaddubsw m4, m5, [r5 + 29 * 16]
14722pmulhrsw m4, m3
14723packuswb m4, m4
14724movh [r0 + 346 * 16 + 8], m4
14725; mode 23 [row 10 - second half] end
14726
14727; mode 23 [row 11 - second half]
14728pmaddubsw m4, m5, [r5 + 20 * 16]
14729pmulhrsw m4, m3
14730packuswb m4, m4
14731movh [r0 + 347 * 16 + 8], m4
14732; mode 23 [row 11 - second half] end
14733
14734; mode 23 [row 12 - second half]
14735pmaddubsw m4, m5, [r5 + 11 * 16]
14736pmulhrsw m4, m3
14737packuswb m4, m4
14738movh [r0 + 348 * 16 + 8], m4
14739; mode 23 [row 12 - second half] end
14740
14741; mode 23 [row 13 - second half]
14742pmaddubsw m4, m5, [r5 + 2 * 16]
14743pmulhrsw m4, m3
14744packuswb m4, m4
14745movh [r0 + 349 * 16 + 8], m4
14746; mode 23 [row 13 - second half] end
14747
14748pmaddubsw m4, m0, [r5 + 24 * 16]
14749pmulhrsw m4, m3
14750pmaddubsw m6, m5, [r5 + 24 * 16]
14751pmulhrsw m6, m3
14752packuswb m4, m6
14753movu [r0 + 275 * 16], m4
14754
14755; mode 19 [row 4]
14756pslldq m0, 2
14757pinsrb m0, [r4 + 4], 1
14758pinsrb m0, [r4 + 5], 0
14759pslldq m5, 2
14760pinsrb m5, [r3 + 5], 1
14761pinsrb m5, [r3 + 4], 0
14762
14763; mode 20 [row 6 - second half]
14764pmaddubsw m4, m5, [r5 + 13 * 16]
14765pmulhrsw m4, m3
14766packuswb m4, m4
14767movh [r0 + 294 * 16 + 8], m4
14768; mode 20 [row 6 - second half] end
14769
14770; mode 21 [row 7 - second half]
14771pmaddubsw m4, m5, [r5 + 24 * 16]
14772pmulhrsw m4, m3
14773packuswb m4, m4
14774movh [r0 + 311 * 16 + 8], m4
14775; mode 21 [row 7 - second half] end
14776
14777; mode 21 [row 8 - second half]
14778pmaddubsw m4, m5, [r5 + 7 * 16]
14779pmulhrsw m4, m3
14780packuswb m4, m4
14781movh [r0 + 312 * 16 + 8], m4
14782; mode 21 [row 8 - second half] end
14783
14784; mode 22 [row 9 - second half]
14785pmaddubsw m4, m5, [r5 + 30 * 16]
14786pmulhrsw m4, m3
14787packuswb m4, m4
14788movh [r0 + 329 * 16 + 8], m4
14789; mode 22 [row 9 - second half] end
14790
14791; mode 22 [row 10 - second half]
14792pmaddubsw m4, m5, [r5 + 17 * 16]
14793pmulhrsw m4, m3
14794packuswb m4, m4
14795movh [r0 + 330 * 16 + 8], m4
14796; mode 22 [row 10 - second half] end
14797
14798; mode 22 [row 11 - second half]
14799pmaddubsw m4, m5, [r5 + 4 * 16]
14800pmulhrsw m4, m3
14801packuswb m4, m4
14802movh [r0 + 331 * 16 + 8], m4
14803; mode 22 [row 11 - second half] end
14804
14805; mode 23 [row 14 - second half]
14806pmaddubsw m4, m5, [r5 + 25 * 16]
14807pmulhrsw m4, m3
14808packuswb m4, m4
14809movh [r0 + 350 * 16 + 8], m4
14810; mode 23 [row 14 - second half] end
14811
14812; mode 23 [row 15 - second half]
14813pmaddubsw m4, m5, [r5 + 16 * 16]
14814pmulhrsw m4, m3
14815packuswb m4, m4
14816movh [r0 + 351 * 16 + 8], m4
14817
14818; mode 23 [row 15 - second half] end
14819pmaddubsw m4, m0, [r5 + 30 * 16]
14820pmulhrsw m4, m3
14821pmaddubsw m6, m5, [r5 + 30 * 16]
14822pmulhrsw m6, m3
14823packuswb m4, m6
14824movu [r0 + 276 * 16], m4
14825
14826; mode 19 [row 5]
14827pmaddubsw m4, m0, [r5 + 4 * 16]
14828pmulhrsw m4, m3
14829pmaddubsw m6, m5, [r5 + 4 * 16]
14830pmulhrsw m6, m3
14831packuswb m4, m6
14832movu [r0 + 277 * 16], m4
14833
14834; mode 19 [row 6]
14835pslldq m0, 2
14836pinsrb m0, [r4 + 5], 1
14837pinsrb m0, [r4 + 6], 0
14838pslldq m5, 2
14839pinsrb m5, [r3 + 4], 1
14840pinsrb m5, [r3 + 3], 0
14841
14842; mode 20 [row 7 - second half]
14843pmaddubsw m4, m5, [r5 + 24 * 16]
14844pmulhrsw m4, m3
14845packuswb m4, m4
14846movh [r0 + 295 * 16 + 8], m4
14847; mode 20 [row 7 - second half] end
14848
14849; mode 20 [row 8 - second half]
14850pmaddubsw m4, m5, [r5 + 3 * 16]
14851pmulhrsw m4, m3
14852packuswb m4, m4
14853movh [r0 + 296 * 16 + 8], m4
14854; mode 20 [row 8 - second half] end
14855
14856; mode 21 [row 9 - second half]
14857pmaddubsw m4, m5, [r5 + 22 * 16]
14858pmulhrsw m4, m3
14859packuswb m4, m4
14860movh [r0 + 313 * 16 + 8], m4
14861; mode 21 [row 9 - second half] end
14862
14863; mode 21 [row 10 - second half]
14864pmaddubsw m4, m5, [r5 + 5 * 16]
14865pmulhrsw m4, m3
14866packuswb m4, m4
14867movh [r0 + 314 * 16 + 8], m4
14868; mode 21 [row 10 - second half] end
14869
14870; mode 22 [row 12 - second half]
14871pmaddubsw m4, m5, [r5 + 23 * 16]
14872pmulhrsw m4, m3
14873packuswb m4, m4
14874movh [r0 + 332 * 16 + 8], m4
14875; mode 22 [row 12 - second half] end
14876
; mode 22 [row 13 - second half]
14878pmaddubsw m4, m5, [r5 + 10 * 16]
14879pmulhrsw m4, m3
14880packuswb m4, m4
14881movh [r0 + 333 * 16 + 8], m4
; mode 22 [row 13 - second half] end
14883
14884pmaddubsw m4, m0, [r5 + 10 * 16]
14885pmulhrsw m4, m3
14886pmaddubsw m6, m5, [r5 + 10 * 16]
14887pmulhrsw m6, m3
14888packuswb m4, m6
14889movu [r0 + 278 * 16], m4
14890
14891; mode 19 [row 7]
14892pslldq m0, 2
14893pinsrb m0, [r4 + 6], 1
14894pinsrb m0, [r4 + 7], 0
14895pslldq m5, 2
14896pinsrb m5, [r3 + 3], 1
14897pinsrb m5, [r3 + 2], 0
14898
14899; mode 20 [row 9 - second half]
14900pmaddubsw m4, m5, [r5 + 14 * 16]
14901pmulhrsw m4, m3
14902packuswb m4, m4
14903movh [r0 + 297 * 16 + 8], m4
; mode 20 [row 9 - second half] end
14905
14906; mode 21 [row 11 - second half]
14907pmaddubsw m4, m5, [r5 + 20 * 16]
14908pmulhrsw m4, m3
14909packuswb m4, m4
14910movh [r0 + 315 * 16 + 8], m4
14911; mode 21 [row 11 - second half] end
14912
14913; mode 21 [row 12 - second half]
14914pmaddubsw m4, m5, [r5 + 3 * 16]
14915pmulhrsw m4, m3
14916packuswb m4, m4
14917movh [r0 + 316 * 16 + 8], m4
14918; mode 21 [row 12 - second half] end
14919
14920; mode 22 [row 14 - second half]
14921pmaddubsw m4, m5, [r5 + 29 * 16]
14922pmulhrsw m4, m3
14923packuswb m4, m4
14924movh [r0 + 334 * 16 + 8], m4
14925; mode 22 [row 14 - second half] end
14926
14927; mode 22 [row 15 - second half]
14928pmaddubsw m4, m5, [r5 + 16 * 16]
14929pmulhrsw m4, m3
14930packuswb m4, m4
14931movh [r0 + 335 * 16 + 8], m4
14932; mode 22 [row 15 - second half] end
14933
14934pmaddubsw m4, m0, [r5 + 16 * 16]
14935pmulhrsw m4, m3
14936pmaddubsw m6, m5, [r5 + 16 * 16]
14937pmulhrsw m6, m3
14938packuswb m4, m6
14939movu [r0 + 279 * 16], m4
14940
14941; mode 19 [row 8]
14942pslldq m0, 2
14943pinsrb m0, [r4 + 7], 1
14944pinsrb m0, [r4 + 9], 0
14945pslldq m5, 2
14946pinsrb m5, [r3 + 2], 1
14947pinsrb m5, [r3 + 1], 0
14948
14949; mode 20 [row 10 - second half]
14950pmaddubsw m4, m5, [r5 + 25 * 16]
14951pmulhrsw m4, m3
14952packuswb m4, m4
14953movh [r0 + 298 * 16 + 8], m4
14954; mode 20 [row 10 - second half] end
14955
14956; mode 20 [row 11 - second half]
14957pmaddubsw m4, m5, [r5 + 4 * 16]
14958pmulhrsw m4, m3
14959packuswb m4, m4
14960movh [r0 + 299 * 16 + 8], m4
14961; mode 20 [row 11 - second half] end
14962
14963; mode 21 [row 13 - second half]
14964pmaddubsw m4, m5, [r5 + 18 * 16]
14965pmulhrsw m4, m3
14966packuswb m4, m4
14967movh [r0 + 317 * 16 + 8], m4
14968; mode 21 [row 13 - second half] end
14969
14970; mode 21 [row 14 - second half]
14971pmaddubsw m4, m5, [r5 + 1 * 16]
14972pmulhrsw m4, m3
14973packuswb m4, m4
14974movh [r0 + 318 * 16 + 8], m4
14975; mode 21 [row 14 - second half] end
14976
14977pmaddubsw m4, m0, [r5 + 22 * 16]
14978pmulhrsw m4, m3
14979pmaddubsw m6, m5, [r5 + 22 * 16]
14980pmulhrsw m6, m3
14981packuswb m4, m6
14982movu [r0 + 280 * 16], m4
14983
14984; mode 19 [row 9]
14985pslldq m0, 2
14986pinsrb m0, [r4 + 9], 1
14987pinsrb m0, [r4 + 10], 0
14988pslldq m5, 2
14989pinsrb m5, [r3 + 1], 1
14990pinsrb m5, [r3 + 0], 0
14991
14992; mode 20 [row 12 - second half]
14993pmaddubsw m4, m5, [r5 + 15 * 16]
14994pmulhrsw m4, m3
14995packuswb m4, m4
14996movh [r0 + 300 * 16 + 8], m4
14997
14998; mode 20 [row 12 - second half] end
14999pmaddubsw m4, m0, [r5 + 28 * 16]
15000pmulhrsw m4, m3
15001pmaddubsw m6, m5, [r5 + 28 * 16]
15002pmulhrsw m6, m3
15003packuswb m4, m6
15004movu [r0 + 281 * 16], m4
15005
15006; mode 19 [row 10]
15007pmaddubsw m4, m0, [r5 + 2 * 16]
15008pmulhrsw m4, m3
15009pmaddubsw m6, m5, [r5 + 2 * 16]
15010pmulhrsw m6, m3
15011packuswb m4, m6
15012movu [r0 + 282 * 16], m4
15013
15014; mode 19 [row 11]
15015pslldq m0, 2
15016pinsrb m0, [r4 + 10], 1
15017pinsrb m0, [r4 + 11], 0
15018pmaddubsw m4, m0, [r5 + 8 * 16]
15019pmulhrsw m4, m3
15020pslldq m5, 2
15021pinsrb m5, [r4 + 0], 1
15022pinsrb m5, [r4 + 1], 0
15023pmaddubsw m6, m5, [r5 + 8 * 16]
15024pmulhrsw m6, m3
15025packuswb m4, m6
15026movu [r0 + 283 * 16], m4
15027
15028; mode 19 [row 12]
15029pslldq m0, 2
15030pinsrb m0, [r4 + 11], 1
15031pinsrb m0, [r4 + 12], 0
15032pslldq m5, 2
15033pinsrb m5, [r4 + 1], 1
15034pinsrb m5, [r4 + 2], 0
15035pmaddubsw m4, m0, [r5 + 14 * 16]
15036pmulhrsw m4, m3
15037pmaddubsw m6, m5, [r5 + 14 * 16]
15038pmulhrsw m6, m3
15039packuswb m4, m6
15040movu [r0 + 284 * 16], m4
15041
15042; mode 19 [row 13]
15043pslldq m0, 2
15044pinsrb m0, [r4 + 12], 1
15045pinsrb m0, [r4 + 14], 0
15046pmaddubsw m4, m0, [r5 + 20 * 16]
15047pmulhrsw m4, m3
15048pslldq m5, 2
15049pinsrb m5, [r4 + 2], 1
15050pinsrb m5, [r4 + 4], 0
15051pmaddubsw m6, m5, [r5 + 20 * 16]
15052pmulhrsw m6, m3
15053packuswb m4, m6
15054movu [r0 + 285 * 16], m4
15055
15056; mode 19 [row 14]
15057pslldq m0, 2
15058pinsrb m0, [r4 + 14], 1
15059pinsrb m0, [r4 + 15], 0
15060pmaddubsw m4, m0, [r5 + 26 * 16]
15061pmulhrsw m4, m3
15062pslldq m5, 2
15063pinsrb m5, [r4 + 4], 1
15064pinsrb m5, [r4 + 5], 0
15065pmaddubsw m6, m5, [r5 + 26 * 16]
15066pmulhrsw m6, m3
15067packuswb m4, m6
15068movu [r0 + 286 * 16], m4
15069
15070; mode 19 [row 15]
15071movu m0, [r4]
15072pshufb m0, [tab_S1]
15073movu [r0 + 287 * 16], m0
15074movd m1, [r3]
15075movd [r0 + 287 * 16 + 12], m1
15076
15077; mode 25
15078movu m1, [r1]
15079
; mode 26 [all rows]
; Every row receives the same 16 bytes: m1 shifted down by one byte with the
; byte at [r1 + 16] inserted into the top lane.
    psrldq      m6, m1, 1
    pinsrb      m6, [r1 + 16], 15
    movu        m7, m6                      ; unmodified copy; m6 is overwritten below

%assign mode26_row 0
%rep 16
    movu        [r0 + (384 + mode26_row) * 16], m6
%assign mode26_row mode26_row + 1
%endrep

; Recompute byte 0 of each row as
;   sat_u8(b + ((x[i] - c) >> 1))
; where b = byte 0 of the row just stored, c = byte at [r2], and x[i] are the
; 16 bytes at [r2 + 1]. All arithmetic is done on 16-bit lanes.
; NOTE(review): presumably the edge filter of the pure vertical mode - confirm.
    pxor        m0, m0
    pshufb      m6, m6, m0                  ; all-zero mask broadcasts byte 0
    punpcklbw   m6, m0                      ; b widened to words
    movu        m2, [r2]
    pshufb      m2, m2, m0
    punpcklbw   m2, m0                      ; c widened to words
    movu        m4, [r2 + 1]
    punpcklbw   m5, m4, m0                  ; x[0..7]  as words
    punpckhbw   m4, m0                      ; x[8..15] as words
    psubw       m5, m2
    psubw       m4, m2
    psraw       m5, 1
    psraw       m4, 1
    paddw       m5, m6
    paddw       m4, m6
    packuswb    m5, m4                      ; 16 result bytes, one per row

%assign mode26_row 0
%rep 16
    pextrb      [r0 + (384 + mode26_row) * 16], m5, mode26_row
%assign mode26_row mode26_row + 1
%endrep
%undef mode26_row
15134
; mode 25 [row 15]
; The last row is the 16 bytes held in m1, stored unchanged.
    movu        [r0 + 383 * 16], m1

; mode 25 [rows 0 - 14]
; Build adjacent-byte pairs for both halves of the row:
;   m1 = pairs from the low 8 bytes of m1, m2 = pairs from the bytes at [r1 + 8].
    psrldq      m2, m1, 1
    punpcklbw   m1, m2
    movu        m2, [r1 + 8]
    psrldq      m4, m2, 1
    punpcklbw   m2, m4

; Row n uses the 16-byte weight entry (30 - 2 * n) at r5 and is written to
; destination row 368 + n. Each half is weighted, scaled by m3 with rounding
; (pmulhrsw), then both halves are packed to bytes with unsigned saturation.
%assign mode25_row 0
%rep 15
    pmaddubsw   m4, m1, [r5 + (30 - 2 * mode25_row) * 16]
    pmulhrsw    m4, m3
    pmaddubsw   m5, m2, [r5 + (30 - 2 * mode25_row) * 16]
    pmulhrsw    m5, m3
    packuswb    m4, m5
    movu        [r0 + (368 + mode25_row) * 16], m4
%assign mode25_row mode25_row + 1
%endrep
%undef mode25_row
15262
; mode 27 [row 15]
; m6 = m7 shifted down one byte with the byte at [r1 + 17] in the top lane;
; m7 is turned into adjacent-byte pairs of its low half on the way.
    psrldq      m6, m7, 1
    punpcklbw   m7, m6
    pinsrb      m6, [r1 + 17], 15
    movu        [r0 + 415 * 16], m6

; mode 27 [rows 0 - 14]
; m4 = adjacent-byte pairs for the upper half of the row, from [r1 + 9].
    movu        m4, [r1 + 9]
    psrldq      m5, m4, 1
    punpcklbw   m4, m5

; Row n uses the 16-byte weight entry (2 + 2 * n) at r5 and is written to
; destination row 400 + n. Each half is weighted, scaled by m3 with rounding
; (pmulhrsw), then both halves are packed to bytes with unsigned saturation.
%assign mode27_row 0
%rep 15
    pmaddubsw   m6, m7, [r5 + (2 + 2 * mode27_row) * 16]
    pmulhrsw    m6, m3
    pmaddubsw   m5, m4, [r5 + (2 + 2 * mode27_row) * 16]
    pmulhrsw    m5, m3
    packuswb    m6, m5
    movu        [r0 + (400 + mode27_row) * 16], m6
%assign mode27_row mode27_row + 1
%endrep
%undef mode27_row
15391
; mode 28 [row 0]
; Build adjacent-byte pairs for both halves of the row:
;   m1 = pairs from the low 8 bytes at [r3 + 1], m4 = pairs from [r3 + 9].
; The following mode 28 - 32 stanzas read m1 / m4 without modifying them.
    movu        m1, [r3 + 1]
    psrldq      m2, m1, 1
    punpcklbw   m1, m2
    movu        m4, [r3 + 9]
    psrldq      m5, m4, 1
    punpcklbw   m4, m5

; Row 0 is computed and stored exactly once (weight entry 5).
    pmaddubsw   m2, m1, [r5 + 5 * 16]
    pmulhrsw    m2, m3
    pmaddubsw   m5, m4, [r5 + 5 * 16]
    pmulhrsw    m5, m3
    packuswb    m2, m5
    movu        [r0 + 416 * 16], m2
15413
15414; mode 28 [row 1]
15415pmaddubsw m2, m1, [r5 + 10 * 16]
15416pmulhrsw m2, m3
15417pmaddubsw m5, m4, [r5 + 10 * 16]
15418pmulhrsw m5, m3
15419packuswb m2, m5
15420movu [r0 + 417 * 16], m2
15421
15422; mode 28 [row 2]
15423pmaddubsw m2, m1, [r5 + 15 * 16]
15424pmulhrsw m2, m3
15425pmaddubsw m5, m4, [r5 + 15 * 16]
15426pmulhrsw m5, m3
15427packuswb m2, m5
15428movu [r0 + 418 * 16], m2
15429
15430; mode 28 [row 3]
15431pmaddubsw m2, m1, [r5 + 20 * 16]
15432pmulhrsw m2, m3
15433pmaddubsw m5, m4, [r5 + 20 * 16]
15434pmulhrsw m5, m3
15435packuswb m2, m5
15436movu [r0 + 419 * 16], m2
15437
15438; mode 28 [row 4]
15439pmaddubsw m2, m1, [r5 + 25 * 16]
15440pmulhrsw m2, m3
15441pmaddubsw m5, m4, [r5 + 25 * 16]
15442pmulhrsw m5, m3
15443packuswb m2, m5
15444movu [r0 + 420 * 16], m2
15445
15446; mode 28 [row 5]
15447pmaddubsw m2, m1, [r5 + 30 * 16]
15448pmulhrsw m2, m3
15449pmaddubsw m5, m4, [r5 + 30 * 16]
15450pmulhrsw m5, m3
15451packuswb m2, m5
15452movu [r0 + 421 * 16], m2
15453
15454; mode 29 [row 0]
15455pmaddubsw m2, m1, [r5 + 9 * 16]
15456pmulhrsw m2, m3
15457pmaddubsw m5, m4, [r5 + 9 * 16]
15458pmulhrsw m5, m3
15459packuswb m2, m5
15460movu [r0 + 432 * 16], m2
15461
15462; mode 29 [row 1]
15463pmaddubsw m2, m1, [r5 + 18 * 16]
15464pmulhrsw m2, m3
15465pmaddubsw m5, m4, [r5 + 18 * 16]
15466pmulhrsw m5, m3
15467packuswb m2, m5
15468movu [r0 + 433 * 16], m2
15469
15470; mode 29 [row 2]
15471pmaddubsw m2, m1, [r5 + 27 * 16]
15472pmulhrsw m2, m3
15473pmaddubsw m5, m4, [r5 + 27 * 16]
15474pmulhrsw m5, m3
15475packuswb m2, m5
15476movu [r0 + 434 * 16], m2
15477
15478; mode 30 [row 0]
15479pmaddubsw m2, m1, [r5 + 13 * 16]
15480pmulhrsw m2, m3
15481pmaddubsw m5, m4, [r5 + 13 * 16]
15482pmulhrsw m5, m3
15483packuswb m2, m5
15484movu [r0 + 448 * 16], m2
15485
15486; mode 30 [row 1]
15487pmaddubsw m2, m1, [r5 + 26 * 16]
15488pmulhrsw m2, m3
15489pmaddubsw m5, m4, [r5 + 26 * 16]
15490pmulhrsw m5, m3
15491packuswb m2, m5
15492movu [r0 + 449 * 16], m2
15493
15494; mode 33 [row 0]
15495movu [r0 + 496 * 16], m2
15496
15497; mode 31 [row 0]
15498pmaddubsw m2, m1, [r5 + 17 * 16]
15499pmulhrsw m2, m3
15500pmaddubsw m5, m4, [r5 + 17 * 16]
15501pmulhrsw m5, m3
15502packuswb m2, m5
15503movu [r0 + 464 * 16], m2
15504
15505; mode 32 [row 0]
15506pmaddubsw m2, m1, [r5 + 21 * 16]
15507pmulhrsw m2, m3
15508pmaddubsw m5, m4, [r5 + 21 * 16]
15509pmulhrsw m5, m3
15510packuswb m2, m5
15511movu [r0 + 480 * 16], m2
15512
15513; mode 28 [row 6]
15514movd m7, [r3 + 9]
15515palignr m7, m1, 2
15516pmaddubsw m2, m7, [r5 + 3 * 16]
15517pmulhrsw m2, m3
15518movd m6, [r3 + 17]
15519palignr m6, m4, 2
15520pmaddubsw m5, m6, [r5 + 3 * 16]
15521pmulhrsw m5, m3
15522packuswb m2, m5
15523movu [r0 + 422 * 16], m2
15524
15525; mode 28 [row 7]
15526pmaddubsw m2, m7, [r5 + 8 * 16]
15527pmulhrsw m2, m3
15528pmaddubsw m5, m6, [r5 + 8 * 16]
15529pmulhrsw m5, m3
15530packuswb m2, m5
15531movu [r0 + 423 * 16], m2
15532
15533; mode 28 [row 8]
15534pmaddubsw m2, m7, [r5 + 13 * 16]
15535pmulhrsw m2, m3
15536pmaddubsw m5, m6, [r5 + 13 * 16]
15537pmulhrsw m5, m3
15538packuswb m2, m5
15539movu [r0 + 424 * 16], m2
15540
15541; mode 28 [row 9]
15542pmaddubsw m2, m7, [r5 + 18 * 16]
15543pmulhrsw m2, m3
15544pmaddubsw m5, m6, [r5 + 18 * 16]
15545pmulhrsw m5, m3
15546packuswb m2, m5
15547movu [r0 + 425 * 16], m2
15548
15549; mode 28 [row 10]
15550pmaddubsw m2, m7, [r5 + 23 * 16]
15551pmulhrsw m2, m3
15552pmaddubsw m5, m6, [r5 + 23 * 16]
15553pmulhrsw m5, m3
15554packuswb m2, m5
15555movu [r0 + 426 * 16], m2
15556
15557; mode 29 [row 3]
15558pmaddubsw m2, m7, [r5 + 4 * 16]
15559pmulhrsw m2, m3
15560pmaddubsw m5, m6, [r5 + 4 * 16]
15561pmulhrsw m5, m3
15562packuswb m2, m5
15563movu [r0 + 435 * 16], m2
15564
15565; mode 29 [row 4]
15566pmaddubsw m2, m7, [r5 + 13 * 16]
15567pmulhrsw m2, m3
15568pmaddubsw m5, m6, [r5 + 13 * 16]
15569pmulhrsw m5, m3
15570packuswb m2, m5
15571movu [r0 + 436 * 16], m2
15572
15573; mode 29 [row 5]
15574pmaddubsw m2, m7, [r5 + 22 * 16]
15575pmulhrsw m2, m3
15576pmaddubsw m5, m6, [r5 + 22 * 16]
15577pmulhrsw m5, m3
15578packuswb m2, m5
15579movu [r0 + 437 * 16], m2
15580
15581; mode 29 [row 6]
15582pmaddubsw m2, m7, [r5 + 31 * 16]
15583pmulhrsw m2, m3
15584pmaddubsw m5, m6, [r5 + 31 * 16]
15585pmulhrsw m5, m3
15586packuswb m2, m5
15587movu [r0 + 438 * 16], m2
15588
15589; mode 32 [row 2]
15590movu [r0 + 482 * 16], m2
15591
15592; mode 30 [row 2]
15593pmaddubsw m2, m7, [r5 + 7 * 16]
15594pmulhrsw m2, m3
15595pmaddubsw m5, m6, [r5 + 7 * 16]
15596pmulhrsw m5, m3
15597packuswb m2, m5
15598movu [r0 + 450 * 16], m2
15599
15600; mode 30 [row 3]
15601pmaddubsw m2, m7, [r5 + 20 * 16]
15602pmulhrsw m2, m3
15603pmaddubsw m5, m6, [r5 + 20 * 16]
15604pmulhrsw m5, m3
15605packuswb m2, m5
15606movu [r0 + 451 * 16], m2
15607
15608; mode 33 [row 1]
15609movu [r0 + 497 * 16], m2
15610
15611; mode 31 [row 1]
15612pmaddubsw m2, m7, [r5 + 2 * 16]
15613pmulhrsw m2, m3
15614pmaddubsw m5, m6, [r5 + 2 * 16]
15615pmulhrsw m5, m3
15616packuswb m2, m5
15617movu [r0 + 465 * 16], m2
15618
15619; mode 31 [row 2]
15620pmaddubsw m2, m7, [r5 + 19 * 16]
15621pmulhrsw m2, m3
15622pmaddubsw m5, m6, [r5 + 19 * 16]
15623pmulhrsw m5, m3
15624packuswb m2, m5
15625movu [r0 + 466 * 16], m2
15626
15627; mode 32 [row 1]
15628pmaddubsw m2, m7, [r5 + 10 * 16]
15629pmulhrsw m2, m3
15630pmaddubsw m5, m6, [r5 + 10 * 16]
15631pmulhrsw m5, m3
15632packuswb m2, m5
15633movu [r0 + 481 * 16], m2
15634
15635; mode 28 [row 11]
15636pmaddubsw m2, m7, [r5 + 28 * 16]
15637pmulhrsw m2, m3
15638pmaddubsw m5, m6, [r5 + 28 * 16]
15639pmulhrsw m5, m3
15640packuswb m2, m5
15641movu [r0 + 427 * 16], m2
15642
15643; mode 28 [row 12]
15644movd m1, [r3 + 10]
15645palignr m1, m7, 2
15646pmaddubsw m2, m1, [r5 + 1 * 16]
15647pmulhrsw m2, m3
15648movd m4, [r3 + 18]
15649palignr m4, m6, 2
15650pmaddubsw m5, m4, [r5 + 1 * 16]
15651pmulhrsw m5, m3
15652packuswb m2, m5
15653movu [r0 + 428 * 16], m2
15654
15655; mode 30 [row 4]
15656movu [r0 + 452 * 16], m2
15657
15658; mode 28 [row 13]
15659pmaddubsw m2, m1, [r5 + 6 * 16]
15660pmulhrsw m2, m3
15661pmaddubsw m5, m4, [r5 + 6 * 16]
15662pmulhrsw m5, m3
15663packuswb m2, m5
15664movu [r0 + 429 * 16], m2
15665
15666; mode 28 [row 14]
15667pmaddubsw m2, m1, [r5 + 11 * 16]
15668pmulhrsw m2, m3
15669pmaddubsw m5, m4, [r5 + 11 * 16]
15670pmulhrsw m5, m3
15671packuswb m2, m5
15672movu [r0 + 430 * 16], m2
15673
15674; mode 28 [row 15]
15675pmaddubsw m2, m1, [r5 + 16 * 16]
15676pmulhrsw m2, m3
15677pmaddubsw m5, m4, [r5 + 16 * 16]
15678pmulhrsw m5, m3
15679packuswb m2, m5
15680movu [r0 + 431 * 16], m2
15681
15682; mode 29 [row 7]
15683pmaddubsw m2, m1, [r5 + 8 * 16]
15684pmulhrsw m2, m3
15685pmaddubsw m5, m4, [r5 + 8 * 16]
15686pmulhrsw m5, m3
15687packuswb m2, m5
15688movu [r0 + 439 * 16], m2
15689
15690; mode 29 [row 8]
15691pmaddubsw m2, m1, [r5 + 17 * 16]
15692pmulhrsw m2, m3
15693pmaddubsw m5, m4, [r5 + 17 * 16]
15694pmulhrsw m5, m3
15695packuswb m2, m5
15696movu [r0 + 440 * 16], m2
15697
15698; mode 29 [row 9]
15699pmaddubsw m2, m1, [r5 + 26 * 16]
15700pmulhrsw m2, m3
15701pmaddubsw m5, m4, [r5 + 26 * 16]
15702pmulhrsw m5, m3
15703packuswb m2, m5
15704movu [r0 + 441 * 16], m2
15705
15706; mode 30 [row 5]
15707pmaddubsw m2, m1, [r5 + 14 * 16]
15708pmulhrsw m2, m3
15709pmaddubsw m5, m4, [r5 + 14 * 16]
15710pmulhrsw m5, m3
15711packuswb m2, m5
15712movu [r0 + 453 * 16], m2
15713
15714; mode 33 [row 2]
15715movu [r0 + 498 * 16], m2
15716
15717; mode 30 [row 6]
15718pmaddubsw m2, m1, [r5 + 27 * 16]
15719pmulhrsw m2, m3
15720pmaddubsw m5, m4, [r5 + 27 * 16]
15721pmulhrsw m5, m3
15722packuswb m2, m5
15723movu [r0 + 454 * 16], m2
15724
15725; mode 31 [row 3]
15726pmaddubsw m2, m1, [r5 + 4 * 16]
15727pmulhrsw m2, m3
15728pmaddubsw m5, m4, [r5 + 4 * 16]
15729pmulhrsw m5, m3
15730packuswb m2, m5
15731movu [r0 + 467 * 16], m2
15732
15733; mode 31 [row 4]
15734pmaddubsw m2, m1, [r5 + 21 * 16]
15735pmulhrsw m2, m3
15736pmaddubsw m5, m4, [r5 + 21 * 16]
15737pmulhrsw m5, m3
15738packuswb m2, m5
15739movu [r0 + 468 * 16], m2
15740
15741; mode 32 [row 3]
15742pmaddubsw m2, m1, [r5 + 20 * 16]
15743pmulhrsw m2, m3
15744pmaddubsw m5, m4, [r5 + 20 * 16]
15745pmulhrsw m5, m3
15746packuswb m2, m5
15747movu [r0 + 483 * 16], m2
15748
15749; mode 29 [row 10]
15750movd m7, [r3 + 11]
15751palignr m7, m1, 2
15752pmaddubsw m2, m7, [r5 + 3 * 16]
15753pmulhrsw m2, m3
15754movd m6, [r3 + 19]
15755palignr m6, m4, 2
15756pmaddubsw m5, m6, [r5 + 3 * 16]
15757pmulhrsw m5, m3
15758packuswb m2, m5
15759movu [r0 + 442 * 16], m2
15760
15761; mode 29 [row 11]
15762pmaddubsw m2, m7, [r5 + 12 * 16]
15763pmulhrsw m2, m3
15764pmaddubsw m5, m6, [r5 + 12 * 16]
15765pmulhrsw m5, m3
15766packuswb m2, m5
15767movu [r0 + 443 * 16], m2
15768
15769; mode 29 [row 12]
15770pmaddubsw m2, m7, [r5 + 21 * 16]
15771pmulhrsw m2, m3
15772pmaddubsw m5, m6, [r5 + 21 * 16]
15773pmulhrsw m5, m3
15774packuswb m2, m5
15775movu [r0 + 444 * 16], m2
15776
15777; mode 30 [row 8]
15778movu [r0 + 456 * 16], m2
15779
15780; mode 29 [row 13]
15781pmaddubsw m2, m7, [r5 + 30 * 16]
15782pmulhrsw m2, m3
15783pmaddubsw m5, m6, [r5 + 30 * 16]
15784pmulhrsw m5, m3
15785packuswb m2, m5
15786movu [r0 + 445 * 16], m2
15787
15788; mode 32 [row 5]
15789movu [r0 + 485 * 16], m2
15790
15791; mode 30 [row 7]
15792pmaddubsw m2, m7, [r5 + 8 * 16]
15793pmulhrsw m2, m3
15794pmaddubsw m5, m6, [r5 + 8 * 16]
15795pmulhrsw m5, m3
15796packuswb m2, m5
15797movu [r0 + 455 * 16], m2
15798
15799; mode 33 [row 3]
15800movu [r0 + 499 * 16], m2
15801
15802; mode 31 [row 5]
15803pmaddubsw m2, m7, [r5 + 6 * 16]
15804pmulhrsw m2, m3
15805pmaddubsw m5, m6, [r5 + 6 * 16]
15806pmulhrsw m5, m3
15807packuswb m2, m5
15808movu [r0 + 469 * 16], m2
15809
15810; mode 31 [row 6]
15811pmaddubsw m2, m7, [r5 + 23 * 16]
15812pmulhrsw m2, m3
15813pmaddubsw m5, m6, [r5 + 23 * 16]
15814pmulhrsw m5, m3
15815packuswb m2, m5
15816movu [r0 + 470 * 16], m2
15817
15818; mode 32 [row 4]
15819pmaddubsw m2, m7, [r5 + 9 * 16]
15820pmulhrsw m2, m3
15821pmaddubsw m5, m6, [r5 + 9 * 16]
15822pmulhrsw m5, m3
15823packuswb m2, m5
15824movu [r0 + 484 * 16], m2
15825
15826movu m1, m7
15827movu m4, m6
15828
15829; mode 29 [row 14]
15830movu m1, [r3 + 12]
15831palignr m1, m7, 2
15832pmaddubsw m2, m1, [r5 + 7 * 16]
15833pmulhrsw m2, m3
15834movd m4, [r3 + 20]
15835palignr m4, m6, 2
15836pmaddubsw m5, m4, [r5 + 7 * 16]
15837pmulhrsw m5, m3
15838packuswb m2, m5
15839movu [r0 + 446 * 16], m2
15840
15841; mode 29 [row 15]
15842pmaddubsw m2, m1, [r5 + 16 * 16]
15843pmulhrsw m2, m3
15844pmaddubsw m5, m4, [r5 + 16 * 16]
15845pmulhrsw m5, m3
15846packuswb m2, m5
15847movu [r0 + 447 * 16], m2
15848
15849; mode 30 [row 9]
15850pmaddubsw m2, m1, [r5 + 2 * 16]
15851pmulhrsw m2, m3
15852pmaddubsw m5, m4, [r5 + 2 * 16]
15853pmulhrsw m5, m3
15854packuswb m2, m5
15855movu [r0 + 457 * 16], m2
15856
15857; mode 33 [row 4]
15858movu [r0 + 500 * 16], m2
15859
15860; mode 30 [row 10]
15861pmaddubsw m2, m1, [r5 + 15 * 16]
15862pmulhrsw m2, m3
15863pmaddubsw m5, m4, [r5 + 15 * 16]
15864pmulhrsw m5, m3
15865packuswb m2, m5
15866movu [r0 + 458 * 16], m2
15867
15868; mode 30 [row 11]
15869pmaddubsw m2, m1, [r5 + 28 * 16]
15870pmulhrsw m2, m3
15871pmaddubsw m5, m4, [r5 + 28 * 16]
15872pmulhrsw m5, m3
15873packuswb m2, m5
15874movu [r0 + 459 * 16], m2
15875
15876; mode 33 [row 5]
15877movu [r0 + 501 * 16], m2
15878
15879; mode 31 [row 7]
15880pmaddubsw m2, m1, [r5 + 8 * 16]
15881pmulhrsw m2, m3
15882pmaddubsw m5, m4, [r5 + 8 * 16]
15883pmulhrsw m5, m3
15884packuswb m2, m5
15885movu [r0 + 471 * 16], m2
15886
15887; mode 31 [row 8]
15888pmaddubsw m2, m1, [r5 + 25 * 16]
15889pmulhrsw m2, m3
15890pmaddubsw m5, m4, [r5 + 25 * 16]
15891pmulhrsw m5, m3
15892packuswb m2, m5
15893movu [r0 + 472 * 16], m2
15894
15895; mode 32 [row 6]
15896pmaddubsw m2, m1, [r5 + 19 * 16]
15897pmulhrsw m2, m3
15898pmaddubsw m5, m4, [r5 + 19 * 16]
15899pmulhrsw m5, m3
15900packuswb m2, m5
15901movu [r0 + 486 * 16], m2
15902
15903; mode 30 [row 12]
15904movd m7, [r3 + 13]
15905palignr m7, m1, 2
15906pmaddubsw m2, m7, [r5 + 9 * 16]
15907pmulhrsw m2, m3
15908movd m6, [r3 + 21]
15909palignr m6, m4, 2
15910pmaddubsw m5, m6, [r5 + 9 * 16]
15911pmulhrsw m5, m3
15912packuswb m2, m5
15913movu [r0 + 460 * 16], m2
15914
15915; mode 30 [row 13]
15916pmaddubsw m2, m7, [r5 + 22 * 16]
15917pmulhrsw m2, m3
15918pmaddubsw m5, m6, [r5 + 22 * 16]
15919pmulhrsw m5, m3
15920packuswb m2, m5
15921movu [r0 + 461 * 16], m2
15922
15923; mode 33 [row 6]
15924movu [r0 + 502 * 16], m2
15925
15926; mode 31 [row 9]
15927pmaddubsw m2, m7, [r5 + 10 * 16]
15928pmulhrsw m2, m3
15929pmaddubsw m5, m6, [r5 + 10 * 16]
15930pmulhrsw m5, m3
15931packuswb m2, m5
15932movu [r0 + 473 * 16], m2
15933
15934; mode 31 [row 10]
15935pmaddubsw m2, m7, [r5 + 27 * 16]
15936pmulhrsw m2, m3
15937pmaddubsw m5, m6, [r5 + 27 * 16]
15938pmulhrsw m5, m3
15939packuswb m2, m5
15940movu [r0 + 474 * 16], m2
15941
15942; mode 32 [row 7]
15943pmaddubsw m2, m7, [r5 + 8 * 16]
15944pmulhrsw m2, m3
15945pmaddubsw m5, m6, [r5 + 8 * 16]
15946pmulhrsw m5, m3
15947packuswb m2, m5
15948movu [r0 + 487 * 16], m2
15949
15950; mode 32 [row 8]
15951pmaddubsw m2, m7, [r5 + 29 * 16]
15952pmulhrsw m2, m3
15953pmaddubsw m5, m6, [r5 + 29 * 16]
15954pmulhrsw m5, m3
15955packuswb m2, m5
15956movu [r0 + 488 * 16], m2
15957
15958
15959movu m1, m7
15960movu m4, m6
15961
15962; mode 30 [row 14]
15963movd m1, [r3 + 14]
15964palignr m1, m7, 2
15965pmaddubsw m2, m1, [r5 + 3 * 16]
15966pmulhrsw m2, m3
15967movd m4, [r3 + 22]
15968palignr m4, m6, 2
15969pmaddubsw m5, m4, [r5 + 3 * 16]
15970pmulhrsw m5, m3
15971packuswb m2, m5
15972movu [r0 + 462 * 16], m2
15973
15974; mode 30 [row 15]
15975pmaddubsw m2, m1, [r5 + 16 * 16]
15976pmulhrsw m2, m3
15977pmaddubsw m5, m4, [r5 + 16 * 16]
15978pmulhrsw m5, m3
15979packuswb m2, m5
15980movu [r0 + 463 * 16], m2
15981
15982; mode 33 [row 7]
15983movu [r0 + 503 * 16], m2
15984
15985; mode 31 [row 11]
15986pmaddubsw m2, m1, [r5 + 12 * 16]
15987pmulhrsw m2, m3
15988pmaddubsw m5, m4, [r5 + 12 * 16]
15989pmulhrsw m5, m3
15990packuswb m2, m5
15991movu [r0 + 475 * 16], m2
15992
15993; mode 31 [row 12]
15994pmaddubsw m2, m1, [r5 + 29 * 16]
15995pmulhrsw m2, m3
15996pmaddubsw m5, m4, [r5 + 29 * 16]
15997pmulhrsw m5, m3
15998packuswb m2, m5
15999movu [r0 + 476 * 16], m2
16000
16001; mode 32 [row 9]
16002pmaddubsw m2, m1, [r5 + 18 * 16]
16003pmulhrsw m2, m3
16004pmaddubsw m5, m4, [r5 + 18 * 16]
16005pmulhrsw m5, m3
16006packuswb m2, m5
16007movu [r0 + 489 * 16], m2
16008
16009; mode 31 [row 13]
16010movd m7, [r3 + 15]
16011palignr m7, m1, 2
16012pmaddubsw m2, m7, [r5 + 14 * 16]
16013pmulhrsw m2, m3
16014movd m6, [r3 + 23]
16015palignr m6, m4, 2
16016pmaddubsw m5, m6, [r5 + 14 * 16]
16017pmulhrsw m5, m3
16018packuswb m2, m5
16019movu [r0 + 477 * 16], m2
16020
16021; mode 31 [row 14]
16022pmaddubsw m2, m7, [r5 + 31 * 16]
16023pmulhrsw m2, m3
16024pmaddubsw m5, m6, [r5 + 31 * 16]
16025pmulhrsw m5, m3
16026packuswb m2, m5
16027movu [r0 + 478 * 16], m2
16028
16029; mode 32 [row 10]
16030pmaddubsw m2, m7, [r5 + 7 * 16]
16031pmulhrsw m2, m3
16032pmaddubsw m5, m6, [r5 + 7 * 16]
16033pmulhrsw m5, m3
16034packuswb m2, m5
16035movu [r0 + 490 * 16], m2
16036
16037; mode 32 [row 11]
16038pmaddubsw m2, m7, [r5 + 28 * 16]
16039pmulhrsw m2, m3
16040pmaddubsw m5, m6, [r5 + 28 * 16]
16041pmulhrsw m5, m3
16042packuswb m2, m5
16043movu [r0 + 491 * 16], m2
16044
16045; mode 33 [row 8]
16046pmaddubsw m2, m7, [r5 + 10 * 16]
16047pmulhrsw m2, m3
16048pmaddubsw m5, m6, [r5 + 10 * 16]
16049pmulhrsw m5, m3
16050packuswb m2, m5
16051movu [r0 + 504 * 16], m2
16052
16053; mode 31 [row 15]
16054movd m1, [r3 + 16]
16055palignr m1, m7, 2
16056pmaddubsw m2, m1, [r5 + 16 * 16]
16057pmulhrsw m2, m3
16058movd m4, [r3 + 24]
16059palignr m4, m6, 2
16060pmaddubsw m5, m4, [r5 + 16 * 16]
16061pmulhrsw m5, m3
16062packuswb m2, m5
16063movu [r0 + 479 * 16], m2
16064
16065; mode 32 [row 12]
16066pmaddubsw m2, m1, [r5 + 17 * 16]
16067pmulhrsw m2, m3
16068pmaddubsw m5, m4, [r5 + 17 * 16]
16069pmulhrsw m5, m3
16070packuswb m2, m5
16071movu [r0 + 492 * 16], m2
16072
16073; mode 33 [row 9]
16074pmaddubsw m2, m1, [r5 + 4 * 16]
16075pmulhrsw m2, m3
16076pmaddubsw m5, m4, [r5 + 4 * 16]
16077pmulhrsw m5, m3
16078packuswb m2, m5
16079movu [r0 + 505 * 16], m2
16080
16081; mode 33 [row 10]
16082pmaddubsw m2, m1, [r5 + 30 * 16]
16083pmulhrsw m2, m3
16084pmaddubsw m5, m4, [r5 + 30 * 16]
16085pmulhrsw m5, m3
16086packuswb m2, m5
16087movu [r0 + 506 * 16], m2
16088
16089; mode 33 [row 9] (repeat: same weights [r5 + 4 * 16] and same destination 505 * 16 as the row 9 block above)
16090pmaddubsw m2, m1, [r5 + 4 * 16]
16091pmulhrsw m2, m3
16092pmaddubsw m5, m4, [r5 + 4 * 16]
16093pmulhrsw m5, m3
16094packuswb m2, m5
16095movu [r0 + 505 * 16], m2
16096
16097; mode 32 [row 13]
16098movd m7, [r3 + 17]
16099palignr m7, m1, 2
16100pmaddubsw m2, m7, [r5 + 6 * 16]
16101pmulhrsw m2, m3
16102
16103movd m6, [r3 + 25]
16104palignr m6, m4, 2
16105pmaddubsw m5, m6, [r5 + 6 * 16]
16106pmulhrsw m5, m3
16107packuswb m2, m5
16108movu [r0 + 493 * 16], m2
16109
16110; mode 32 [row 14]
16111pmaddubsw m2, m7, [r5 + 27 * 16]
16112pmulhrsw m2, m3
16113pmaddubsw m5, m6, [r5 + 27 * 16]
16114pmulhrsw m5, m3
16115packuswb m2, m5
16116movu [r0 + 494 * 16], m2
16117
16118; mode 33 [row 11]
16119pmaddubsw m2, m7, [r5 + 24 * 16]
16120pmulhrsw m2, m3
16121pmaddubsw m5, m6, [r5 + 24 * 16]
16122pmulhrsw m5, m3
16123packuswb m2, m5
16124movu [r0 + 507 * 16], m2
16125
16126; mode 32 [row 15]
16127movd m1, [r3 + 18]
16128palignr m1, m7, 2
16129pmaddubsw m2, m1, [r5 + 16 * 16]
16130pmulhrsw m2, m3
16131psrldq m4, 2
16132pinsrb m4, [r3 + 26], 14
16133pinsrb m4, [r3 + 27], 15
16134movd m4, [r3 + 26]
16135palignr m4, m6, 2
16136pmaddubsw m5, m4, [r5 + 16 * 16]
16137pmulhrsw m5, m3
16138packuswb m2, m5
16139movu [r0 + 495 * 16], m2
16140
16141; mode 33 [row 12]
16142pmaddubsw m2, m1, [r5 + 18 * 16]
16143pmulhrsw m2, m3
16144pmaddubsw m5, m4, [r5 + 18 * 16]
16145pmulhrsw m5, m3
16146packuswb m2, m5
16147movu [r0 + 508 * 16], m2
16148
16149; mode 33 [row 13]
16150movd m7, [r3 + 19]
16151palignr m7, m1, 2
16152pmaddubsw m2, m7, [r5 + 12 * 16]
16153pmulhrsw m2, m3
16154movd m6, [r3 + 27]
16155palignr m6, m4, 2
16156pmaddubsw m5, m6, [r5 + 12 * 16]
16157pmulhrsw m5, m3
16158packuswb m2, m5
16159movu [r0 + 509 * 16], m2
16160
16161; mode 33 [row 14]
16162movd m1, [r3 + 20]
16163palignr m1, m7, 2
16164pmaddubsw m2, m1, [r5 + 6 * 16]
16165pmulhrsw m2, m3
16166movd m4, [r3 + 28]
16167palignr m4, m6, 2
16168pmaddubsw m5, m4, [r5 + 6 * 16]
16169pmulhrsw m5, m3
16170packuswb m2, m5
16171movu [r0 + 510 * 16], m2
16172
16173; mode 34 [rows 0 - 12]
16174movu m1, [r3 + 2]
16175movu [r0 + 512 * 16], m1
16176movu m2, [r3 + 18]
16177palignr m3, m2, m1, 1
16178movu [r0 + 513 * 16], m3
16179palignr m3, m2, m1, 2
16180movu [r0 + 514 * 16], m3
16181palignr m3, m2, m1, 3
16182movu [r0 + 515 * 16], m3
16183palignr m3, m2, m1, 4
16184movu [r0 + 516 * 16], m3
16185palignr m3, m2, m1, 5
16186movu [r0 + 517 * 16], m3
16187palignr m3, m2, m1, 6
16188movu [r0 + 518 * 16], m3
16189palignr m3, m2, m1, 7
16190movu [r0 + 519 * 16], m3
16191palignr m3, m2, m1, 8
16192movu [r0 + 520 * 16], m3
16193palignr m3, m2, m1, 9
16194movu [r0 + 521 * 16], m3
16195palignr m3, m2, m1, 10
16196movu [r0 + 522 * 16], m3
16197palignr m3, m2, m1, 11
16198movu [r0 + 523 * 16], m3
16199palignr m3, m2, m1, 12
16200movu [r0 + 524 * 16], m3
16201
16202; mode 33 [row 15]
16203movu [r0 + 511 * 16], m3
16204
16205; mode 34 [rows 13 - 15]
16206palignr m3, m2, m1, 13
16207movu [r0 + 525 * 16], m3
16208palignr m3, m2, m1, 14
16209movu [r0 + 526 * 16], m3
16210palignr m3, m2, m1, 15
16211movu [r0 + 527 * 16], m3
16212
16213RET
16214
16215;-----------------------------------------------------------------------------
16216; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
16217;-----------------------------------------------------------------------------
16218INIT_XMM sse4
16219cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
16220
16221;mode 2[row 0]
16222movu m0, [r4 + 2]
16223movu [r0 + 0 * 16], m0
16224movu m1, [r4 + 18]
16225movu [r0 + 1 * 16], m1
16226
16227;mode 9 [row 15]
16228movu [r0 + 478 * 16], m0
16229movu [r0 + 479 * 16], m1
16230
16231;mode 2[row 1]
16232movu m2, [r4 + 34]
16233palignr m3, m1, m0, 1
16234movu [r0 + 2 * 16], m3
16235palignr m4, m2, m1, 1
16236movu [r0 + 3 * 16], m4
16237
16238; mode 9 [row 31]
16239movu [r0 + 510 * 16], m3
16240movu [r0 + 511 * 16], m4
16241
16242;mode 2[row 17]
16243movu [r0 + 34 * 16], m4
16244movu m5, [r4 + 35]
16245movu [r0 + 35 * 16], m5
16246
16247;mode 2[row 2]
16248palignr m3, m1, m0, 2
16249movu [r0 + 4 * 16], m3
16250palignr m4, m2, m1, 2
16251movu [r0 + 5 * 16], m4
16252
16253;mode 2[row 18]
16254movu [r0 + 36 * 16], m4
16255movu m6, [r4 + 51]
16256palignr m7, m6, m5, 1
16257movu [r0 + 37 * 16], m7
16258
16259;mode 2[row 3]
16260palignr m3, m1, m0, 3
16261movu [r0 + 6 * 16], m3
16262palignr m4, m2, m1, 3
16263movu [r0 + 7 * 16], m4
16264
16265;mode 2[row 19]
16266movu [r0 + 38 * 16], m4
16267palignr m7, m6, m5, 2
16268movu [r0 + 39 * 16], m7
16269
16270;mode 2[row 4]
16271palignr m3, m1, m0, 4
16272movu [r0 + 8 * 16], m3
16273palignr m4, m2, m1, 4
16274movu [r0 + 9 * 16], m4
16275
16276; mode 8 [row 31]
16277movu [r0 + 446 * 16], m3
16278movu [r0 + 447 * 16], m4
16279
16280;mode 2[row 20]
16281movu [r0 + 40 * 16], m4
16282palignr m7, m6, m5, 3
16283movu [r0 + 41 * 16], m7
16284
16285; mode 4 [row 31]
16286movu [r0 + 190 * 16], m4
16287movu [r0 + 191 * 16], m7
16288
16289;mode 2[row 5]
16290palignr m3, m1, m0, 5
16291movu [r0 + 10 * 16], m3
16292palignr m4, m2, m1, 5
16293movu [r0 + 11 * 16], m4
16294
16295;mode 2[row 21]
16296movu [r0 + 42 * 16], m4
16297palignr m7, m6, m5, 4
16298movu [r0 + 43 * 16], m7
16299
16300;mode 2[row 6]
16301palignr m3, m1, m0, 6
16302movu [r0 + 12 * 16], m3
16303palignr m4, m2, m1, 6
16304movu [r0 + 13 * 16], m4
16305
16306;mode 2[row 22]
16307movu [r0 + 44 * 16], m4
16308palignr m7, m6, m5, 5
16309movu [r0 + 45 * 16], m7
16310
16311;mode 2[row 7]
16312palignr m3, m1, m0, 7
16313movu [r0 + 14 * 16], m3
16314palignr m4, m2, m1, 7
16315movu [r0 + 15 * 16], m4
16316
16317;mode 2[row 23]
16318movu [r0 + 46 * 16], m4
16319palignr m7, m6, m5, 6
16320movu [r0 + 47 * 16], m7
16321
16322;mode 2[row 8]
16323palignr m3, m1, m0, 8
16324movu [r0 + 16 * 16], m3
16325palignr m4, m2, m1, 8
16326movu [r0 + 17 * 16], m4
16327
16328;mode 7[row 31]
16329movu [r0 + 382 * 16], m3
16330movu [r0 + 383 * 16], m4
16331
16332;mode 2[row 24]
16333movu [r0 + 48 * 16], m4
16334palignr m7, m6, m5, 7
16335movu [r0 + 49 * 16], m7
16336
16337;mode 2[row 9]
16338palignr m3, m1, m0, 9
16339movu [r0 + 18 * 16], m3
16340palignr m4, m2, m1, 9
16341movu [r0 + 19 * 16], m4
16342
16343;mode 2[row 25]
16344movu [r0 + 50 * 16], m4
16345palignr m7, m6, m5, 8
16346movu [r0 + 51 * 16], m7
16347
16348; mode 3 [row 31]
16349movu [r0 + 126 * 16], m4
16350movu [r0 + 127 * 16], m7
16351
16352;mode 2[row 10]
16353palignr m3, m1, m0, 10
16354movu [r0 + 20 * 16], m3
16355palignr m4, m2, m1, 10
16356movu [r0 + 21 * 16], m4
16357
16358;mode 2[row 26]
16359movu [r0 + 52 * 16], m4
16360palignr m7, m6, m5, 9
16361movu [r0 + 53 * 16], m7
16362
16363;mode 2[row 11]
16364palignr m3, m1, m0, 11
16365movu [r0 + 22 * 16], m3
16366palignr m4, m2, m1, 11
16367movu [r0 + 23 * 16], m4
16368
16369;mode 2[row 27]
16370movu [r0 + 54 * 16], m4
16371palignr m7, m6, m5, 10
16372movu [r0 + 55 * 16], m7
16373
16374;mode 2[row 12]
16375palignr m3, m1, m0, 12
16376movu [r0 + 24 * 16], m3
16377palignr m4, m2, m1, 12
16378movu [r0 + 25 * 16], m4
16379
16380; mode 6 [row 31]
16381movu [r0 + 318 * 16], m3
16382movu [r0 + 319 * 16], m4
16383
16384; mode 3 [row 15]
16385movu [r0 + 94 * 16], m3
16386movu [r0 + 95 * 16], m4
16387
16388;mode 2[row 28]
16389movu [r0 + 56 * 16], m4
16390palignr m7, m6, m5, 11
16391movu [r0 + 57 * 16], m7
16392
16393;mode 2[row 13]
16394palignr m3, m1, m0, 13
16395movu [r0 + 26 * 16], m3
16396palignr m4, m2, m1, 13
16397movu [r0 + 27 * 16], m4
16398
16399;mode 2[row 29]
16400movu [r0 + 58 * 16], m4
16401palignr m7, m6, m5, 12
16402movu [r0 + 59 * 16], m7
16403
16404;mode 2[row 14]
16405palignr m3, m1, m0, 14
16406movu [r0 + 28 * 16], m3
16407palignr m4, m2, m1, 14
16408movu [r0 + 29 * 16], m4
16409
16410;mode 2[row 30]
16411movu [r0 + 60 * 16], m4
16412palignr m7, m6, m5, 13
16413movu [r0 + 61 * 16], m7
16414
16415;mode 2[row 15]
16416palignr m3, m1, m0, 15
16417movu [r0 + 30 * 16], m3
16418palignr m4, m2, m1, 15
16419movu [r0 + 31 * 16], m4
16420
16421;mode 2[row 31]
16422movu [r0 + 62 * 16], m4
16423palignr m7, m6, m5, 14
16424movu [r0 + 63 * 16], m7
16425
16426;mode 2[row 16]
16427movu [r0 + 32 * 16], m1
16428movu [r0 + 33 * 16], m2
16429
16430; mode 5[row 31]
16431movu [r0 + 254 * 16], m1
16432movu [r0 + 255 * 16], m2
16433
16434; mode 3 [row 0]
16435lea r5, [ang_table]
16436movu m6, [r5 + 26 * 16]
16437movu m7, [pw_1024 ]
16438movu m1, [r4 + 1 ]
16439punpcklbw m1, m0
16440pmaddubsw m0, m1, m6
16441pmulhrsw m0, m7
16442movu m2, [r4 + 9]
16443movd m3, [r4 + 10]
16444palignr m3, m2, 1
16445punpcklbw m2, m3
16446pmaddubsw m3, m2, m6
16447pmulhrsw m3, m7
16448packuswb m0, m3
16449movu [r0 + 64 * 16], m0
16450
16451; mode 6 [row 1 - first half]
16452movu [r0 + 258 * 16], m0
16453
16454; mode 9 [row 12 - first half]
16455movu [r0 + 472 * 16], m0
16456
16457movu m0, [r4 + 17]
16458movd m3, [r4 + 18]
16459palignr m3, m0, 1
16460punpcklbw m0, m3
16461pmaddubsw m3, m0, m6
16462pmulhrsw m3, m7
16463movu m4, [r4 + 25]
16464movd m5, [r4 + 26]
16465palignr m5, m4, 1
16466punpcklbw m4, m5
16467pmaddubsw m5, m4, m6
16468pmulhrsw m5, m7
16469packuswb m3, m5
16470movu [r0 + 65 * 16], m3
16471
16472; mode 6 [row 1 - second half]
16473movu [r0 + 259 * 16], m3
16474
16475; mode 9 [row 12 - second half]
16476movu [r0 + 473 * 16], m3
16477
16478; mode 4 [row 0]
16479movu m6, [r5 + 21 * 16]
16480pmaddubsw m3, m1, m6
16481pmulhrsw m3, m7
16482pmaddubsw m5, m2, m6
16483pmulhrsw m5, m7
16484packuswb m3, m5
16485movu [r0 + 128 * 16], m3
16486pmaddubsw m3, m0, m6
16487pmulhrsw m3, m7
16488pmaddubsw m5, m4, m6
16489pmulhrsw m5, m7
16490packuswb m3, m5
16491movu [r0 + 129 * 16], m3
16492
16493; mode 5 [row 0]
16494movu m6, [r5 + 17 * 16]
16495pmaddubsw m3, m1, m6
16496pmulhrsw m3, m7
16497pmaddubsw m5, m2, m6
16498pmulhrsw m5, m7
16499packuswb m3, m5
16500movu [r0 + 192 * 16], m3
16501pmaddubsw m3, m0, m6
16502pmulhrsw m3, m7
16503pmaddubsw m5, m4, m6
16504pmulhrsw m5, m7
16505packuswb m3, m5
16506movu [r0 + 193 * 16], m3
16507
16508; mode 6 [row 0]
16509movu m6, [r5 + 13 * 16]
16510pmaddubsw m3, m1, m6
16511pmulhrsw m3, m7
16512pmaddubsw m5, m2, m6
16513pmulhrsw m5, m7
16514packuswb m3, m5
16515movu [r0 + 256 * 16], m3
16516pmaddubsw m3, m0, m6
16517pmulhrsw m3, m7
16518pmaddubsw m5, m4, m6
16519pmulhrsw m5, m7
16520packuswb m3, m5
16521movu [r0 + 257 * 16], m3
16522
16523; mode 7 [row 0]
16524movu m6, [r5 + 9 * 16]
16525pmaddubsw m3, m1, m6
16526pmulhrsw m3, m7
16527pmaddubsw m5, m2, m6
16528pmulhrsw m5, m7
16529packuswb m3, m5
16530movu [r0 + 320 * 16], m3
16531pmaddubsw m3, m0, m6
16532pmulhrsw m3, m7
16533pmaddubsw m5, m4, m6
16534pmulhrsw m5, m7
16535packuswb m3, m5
16536movu [r0 + 321 * 16], m3
16537
16538; mode 7 [row 1]
16539movu m6, [r5 + 18 * 16]
16540pmaddubsw m3, m1, m6
16541pmulhrsw m3, m7
16542pmaddubsw m5, m2, m6
16543pmulhrsw m5, m7
16544packuswb m3, m5
16545movu [r0 + 322 * 16], m3
16546
16547; mode 9 [row 8 - first half]
16548movu [r0 + 464 * 16], m3
16549
16550pmaddubsw m3, m0, m6
16551pmulhrsw m3, m7
16552pmaddubsw m5, m4, m6
16553pmulhrsw m5, m7
16554packuswb m3, m5
16555movu [r0 + 323 * 16], m3
16556
16557; mode 9 [row 8 - second half]
16558movu [r0 + 465 * 16], m3
16559
16560; mode 7 [row 2]
16561movu m6, [r5 + 27 * 16]
16562pmaddubsw m3, m1, m6
16563pmulhrsw m3, m7
16564pmaddubsw m5, m2, m6
16565pmulhrsw m5, m7
16566packuswb m3, m5
16567movu [r0 + 324 * 16], m3
16568pmaddubsw m3, m0, m6
16569pmulhrsw m3, m7
16570pmaddubsw m5, m4, m6
16571pmulhrsw m5, m7
16572packuswb m3, m5
16573movu [r0 + 325 * 16], m3
16574
16575; mode 8 [row 0]
16576movu m6, [r5 + 5 * 16]
16577pmaddubsw m3, m1, m6
16578pmulhrsw m3, m7
16579pmaddubsw m5, m2, m6
16580pmulhrsw m5, m7
16581packuswb m3, m5
16582movu [r0 + 384 * 16], m3
16583pmaddubsw m3, m0, m6
16584pmulhrsw m3, m7
16585pmaddubsw m5, m4, m6
16586pmulhrsw m5, m7
16587packuswb m3, m5
16588movu [r0 + 385 * 16], m3
16589
16590; mode 8 [row 1]
16591movu m6, [r5 + 10 * 16]
16592pmaddubsw m3, m1, m6
16593pmulhrsw m3, m7
16594pmaddubsw m5, m2, m6
16595pmulhrsw m5, m7
16596packuswb m3, m5
16597movu [r0 + 386 * 16], m3
16598
16599; mode 9 [row 4 - first half]
16600movu [r0 + 456 * 16], m3
16601
16602pmaddubsw m3, m0, m6
16603pmulhrsw m3, m7
16604pmaddubsw m5, m4, m6
16605pmulhrsw m5, m7
16606packuswb m3, m5
16607movu [r0 + 387 * 16], m3
16608
16609; mode 9 [row 4 - second half]
16610movu [r0 + 457 * 16], m3
16611
16612; mode 8 [row 2]
16613movu m6, [r5 + 15 * 16]
16614pmaddubsw m3, m1, m6
16615pmulhrsw m3, m7
16616pmaddubsw m5, m2, m6
16617pmulhrsw m5, m7
16618packuswb m3, m5
16619movu [r0 + 388 * 16], m3
16620pmaddubsw m3, m0, m6
16621pmulhrsw m3, m7
16622pmaddubsw m5, m4, m6
16623pmulhrsw m5, m7
16624packuswb m3, m5
16625movu [r0 + 389 * 16], m3
16626
16627; mode 8 [row 3]
16628movu m6, [r5 + 20 * 16]
16629pmaddubsw m3, m1, m6
16630pmulhrsw m3, m7
16631pmaddubsw m5, m2, m6
16632pmulhrsw m5, m7
16633packuswb m3, m5
16634movu [r0 + 390 * 16], m3
16635
16636; mode 9 [row 9 - first half]
16637movu [r0 + 466 * 16], m3
16638
16639pmaddubsw m3, m0, m6
16640pmulhrsw m3, m7
16641pmaddubsw m5, m4, m6
16642pmulhrsw m5, m7
16643packuswb m3, m5
16644movu [r0 + 391 * 16], m3
16645
16646; mode 9 [row 9 - second half]
16647movu [r0 + 467 * 16], m3
16648
16649; mode 8 [row 4]
16650movu m6, [r5 + 25 * 16]
16651pmaddubsw m3, m1, m6
16652pmulhrsw m3, m7
16653pmaddubsw m5, m2, m6
16654pmulhrsw m5, m7
16655packuswb m3, m5
16656movu [r0 + 392 * 16], m3
16657pmaddubsw m3, m0, m6
16658pmulhrsw m3, m7
16659pmaddubsw m5, m4, m6
16660pmulhrsw m5, m7
16661packuswb m3, m5
16662movu [r0 + 393 * 16], m3
16663
16664; mode 8 [row 5]
16665movu m6, [r5 + 30 * 16]
16666pmaddubsw m3, m1, m6
16667pmulhrsw m3, m7
16668pmaddubsw m5, m2, m6
16669pmulhrsw m5, m7
16670packuswb m3, m5
16671movu [r0 + 394 * 16], m3
16672
16673; mode 9 [row 14 - first half]
16674movu [r0 + 476 * 16], m3
16675
16676pmaddubsw m3, m0, m6
16677pmulhrsw m3, m7
16678pmaddubsw m5, m4, m6
16679pmulhrsw m5, m7
16680packuswb m3, m5
16681movu [r0 + 395 * 16], m3
16682
16683; mode 9 [row 14 - second half]
16684movu [r0 + 477 * 16], m3
16685
16686; mode 9 [row 0]
16687movu m6, [r5 + 2 * 16]
16688pmaddubsw m3, m1, m6
16689pmulhrsw m3, m7
16690pmaddubsw m5, m2, m6
16691pmulhrsw m5, m7
16692packuswb m3, m5
16693movu [r0 + 448 * 16], m3
16694pmaddubsw m3, m0, m6
16695pmulhrsw m3, m7
16696pmaddubsw m5, m4, m6
16697pmulhrsw m5, m7
16698packuswb m3, m5
16699movu [r0 + 449 * 16], m3
16700
16701; mode 9 [row 1]
16702movu m6, [r5 + 4 * 16]
16703pmaddubsw m3, m1, m6
16704pmulhrsw m3, m7
16705pmaddubsw m5, m2, m6
16706pmulhrsw m5, m7
16707packuswb m3, m5
16708movu [r0 + 450 * 16], m3
16709pmaddubsw m3, m0, m6
16710pmulhrsw m3, m7
16711pmaddubsw m5, m4, m6
16712pmulhrsw m5, m7
16713packuswb m3, m5
16714movu [r0 + 451 * 16], m3
16715
16716; mode 9 [row 2]
16717movu m6, [r5 + 6 * 16]
16718pmaddubsw m3, m1, m6
16719pmulhrsw m3, m7
16720pmaddubsw m5, m2, m6
16721pmulhrsw m5, m7
16722packuswb m3, m5
16723movu [r0 + 452 * 16], m3
16724pmaddubsw m3, m0, m6
16725pmulhrsw m3, m7
16726pmaddubsw m5, m4, m6
16727pmulhrsw m5, m7
16728packuswb m3, m5
16729movu [r0 + 453 * 16], m3
16730
16731; mode 9 [row 3]
16732movu m6, [r5 + 8 * 16]
16733pmaddubsw m3, m1, m6
16734pmulhrsw m3, m7
16735pmaddubsw m5, m2, m6
16736pmulhrsw m5, m7
16737packuswb m3, m5
16738movu [r0 + 454 * 16], m3
16739pmaddubsw m3, m0, m6
16740pmulhrsw m3, m7
16741pmaddubsw m5, m4, m6
16742pmulhrsw m5, m7
16743packuswb m3, m5
16744movu [r0 + 455 * 16], m3
16745
16746; mode 9 [row 5]
16747movu m6, [r5 + 12 * 16]
16748pmaddubsw m3, m1, m6
16749pmulhrsw m3, m7
16750pmaddubsw m5, m2, m6
16751pmulhrsw m5, m7
16752packuswb m3, m5
16753movu [r0 + 458 * 16], m3
16754pmaddubsw m3, m0, m6
16755pmulhrsw m3, m7
16756pmaddubsw m5, m4, m6
16757pmulhrsw m5, m7
16758packuswb m3, m5
16759movu [r0 + 459 * 16], m3
16760
16761; mode 9 [row 6]
16762movu m6, [r5 + 14 * 16]
16763pmaddubsw m3, m1, m6
16764pmulhrsw m3, m7
16765pmaddubsw m5, m2, m6
16766pmulhrsw m5, m7
16767packuswb m3, m5
16768movu [r0 + 460 * 16], m3
16769pmaddubsw m3, m0, m6
16770pmulhrsw m3, m7
16771pmaddubsw m5, m4, m6
16772pmulhrsw m5, m7
16773packuswb m3, m5
16774movu [r0 + 461 * 16], m3
16775
16776; mode 9 [row 7]
16777movu m6, [r5 + 16 * 16]
16778pmaddubsw m3, m1, m6
16779pmulhrsw m3, m7
16780pmaddubsw m5, m2, m6
16781pmulhrsw m5, m7
16782packuswb m3, m5
16783movu [r0 + 462 * 16], m3
16784pmaddubsw m3, m0, m6
16785pmulhrsw m3, m7
16786pmaddubsw m5, m4, m6
16787pmulhrsw m5, m7
16788packuswb m3, m5
16789movu [r0 + 463 * 16], m3
16790
16791; mode 9 [row 10]
16792movu m6, [r5 + 22 * 16]
16793pmaddubsw m3, m1, m6
16794pmulhrsw m3, m7
16795pmaddubsw m5, m2, m6
16796pmulhrsw m5, m7
16797packuswb m3, m5
16798movu [r0 + 468 * 16], m3
16799pmaddubsw m3, m0, m6
16800pmulhrsw m3, m7
16801pmaddubsw m5, m4, m6
16802pmulhrsw m5, m7
16803packuswb m3, m5
16804movu [r0 + 469 * 16], m3
16805
16806; mode 9 [row 11]
16807movu m6, [r5 + 24 * 16]
16808pmaddubsw m3, m1, m6
16809pmulhrsw m3, m7
16810pmaddubsw m5, m2, m6
16811pmulhrsw m5, m7
16812packuswb m3, m5
16813movu [r0 + 470 * 16], m3
16814pmaddubsw m3, m0, m6
16815pmulhrsw m3, m7
16816pmaddubsw m5, m4, m6
16817pmulhrsw m5, m7
16818packuswb m3, m5
16819movu [r0 + 471 * 16], m3
16820
16821; mode 9 [row 13]
16822movu m6, [r5 + 28 * 16]
16823pmaddubsw m3, m1, m6
16824pmulhrsw m3, m7
16825pmaddubsw m5, m2, m6
16826pmulhrsw m5, m7
16827packuswb m3, m5
16828movu [r0 + 474 * 16], m3
16829pmaddubsw m3, m0, m6
16830pmulhrsw m3, m7
16831pmaddubsw m5, m4, m6
16832pmulhrsw m5, m7
16833packuswb m3, m5
16834movu [r0 + 475 * 16], m3
16835
16836; mode 3 [row 1]
16837movu m6, [r5 + 20 * 16]
16838movu m0, [r4 + 2]
16839movd m1, [r4 + 3]
16840palignr m1, m0, 1
16841punpcklbw m0, m1
16842pmaddubsw m1, m0, m6
16843pmulhrsw m1, m7
16844movu m2, [r4 + 10]
16845movd m3, [r4 + 11]
16846palignr m3, m2, 1
16847punpcklbw m2, m3
16848pmaddubsw m3, m2, m6
16849pmulhrsw m3, m7
16850packuswb m1, m3
16851movu [r0 + 66 * 16], m1
16852
16853; mode 6 [row 3 - first half]
16854movu [r0 + 262 * 16], m1
16855
16856; mode 9 [row 25 - first half]
16857movu [r0 + 498 * 16], m1
16858
16859movu m1, [r4 + 18]
16860movd m3, [r4 + 19]
16861palignr m3, m1, 1
16862punpcklbw m1, m3
16863pmaddubsw m3, m1, m6
16864pmulhrsw m3, m7
16865movu m4, [r4 + 26]
16866movd m5, [r4 + 27]
16867palignr m5, m4, 1
16868punpcklbw m4, m5
16869pmaddubsw m5, m4, m6
16870pmulhrsw m5, m7
16871packuswb m3, m5
16872movu [r0 + 67 * 16], m3
16873
16874; mode 6 [row 3 - second half]
16875movu [r0 + 263 * 16], m3
16876
16877; mode 9 [row 25 - second half]
16878movu [r0 + 499 * 16], m3
16879
16880; mode 4 [row 1]
16881movu m6, [r5 + 10 * 16]
16882pmaddubsw m3, m0, m6
16883pmulhrsw m3, m7
16884pmaddubsw m5, m2, m6
16885pmulhrsw m5, m7
16886packuswb m3, m5
16887movu [r0 + 130 * 16], m3
16888
16889; mode 9 [row 20 - first half]
16890movu [r0 + 488 * 16], m3
16891
16892pmaddubsw m3, m1, m6
16893pmulhrsw m3, m7
16894pmaddubsw m5, m4, m6
16895pmulhrsw m5, m7
16896packuswb m3, m5
16897movu [r0 + 131 * 16], m3
16898
16899; mode 9 [row 20 - second half]
16900movu [r0 + 489 * 16], m3
16901
16902; mode 4 [row 2]
16903movu m6, [r5 + 31 * 16]
16904pmaddubsw m3, m0, m6
16905pmulhrsw m3, m7
16906pmaddubsw m5, m2, m6
16907pmulhrsw m5, m7
16908packuswb m3, m5
16909movu [r0 + 132 * 16], m3
16910
16911; mode 7 [row 6 - first half]
16912movu [r0 + 332 * 16], m3
16913
16914pmaddubsw m3, m1, m6
16915pmulhrsw m3, m7
16916pmaddubsw m5, m4, m6
16917pmulhrsw m5, m7
16918packuswb m3, m5
16919movu [r0 + 133 * 16], m3
16920
16921; mode 7 [row 6 - second half]
16922movu [r0 + 333 * 16], m3
16923
16924; mode 5 [row 1]
16925movu m6, [r5 + 2 * 16]
16926pmaddubsw m3, m0, m6
16927pmulhrsw m3, m7
16928pmaddubsw m5, m2, m6
16929pmulhrsw m5, m7
16930packuswb m3, m5
16931movu [r0 + 194 * 16], m3
16932
16933; mode 9 [row 16 - first half]
16934movu [r0 + 480 * 16], m3
16935
16936pmaddubsw m3, m1, m6
16937pmulhrsw m3, m7
16938pmaddubsw m5, m4, m6
16939pmulhrsw m5, m7
16940packuswb m3, m5
16941movu [r0 + 195 * 16], m3
16942
16943; mode 9 [row 16 - second half]
16944movu [r0 + 481 * 16], m3
16945
16946; mode 5 [row 2]
16947movu m6, [r5 + 19 * 16]
16948pmaddubsw m3, m0, m6
16949pmulhrsw m3, m7
16950pmaddubsw m5, m2, m6
16951pmulhrsw m5, m7
16952packuswb m3, m5
16953movu [r0 + 196 * 16], m3
16954pmaddubsw m3, m1, m6
16955pmulhrsw m3, m7
16956pmaddubsw m5, m4, m6
16957pmulhrsw m5, m7
16958packuswb m3, m5
16959movu [r0 + 197 * 16], m3
16960
16961; mode 6 [row 2]
16962movu m6, [r5 + 7 * 16]
16963pmaddubsw m3, m0, m6
16964pmulhrsw m3, m7
16965pmaddubsw m5, m2, m6
16966pmulhrsw m5, m7
16967packuswb m3, m5
16968movu [r0 + 260 * 16], m3
16969pmaddubsw m3, m1, m6
16970pmulhrsw m3, m7
16971pmaddubsw m5, m4, m6
16972pmulhrsw m5, m7
16973packuswb m3, m5
16974movu [r0 + 261 * 16], m3
16975
16976; mode 7 [row 3]
16977movu m6, [r5 + 4 * 16]
16978pmaddubsw m3, m0, m6
16979pmulhrsw m3, m7
16980pmaddubsw m5, m2, m6
16981pmulhrsw m5, m7
16982packuswb m3, m5
16983movu [r0 + 326 * 16], m3
16984
16985; mode 9 [row 17 - first half]
16986movu [r0 + 482 * 16], m3
16987
16988pmaddubsw m3, m1, m6
16989pmulhrsw m3, m7
16990pmaddubsw m5, m4, m6
16991pmulhrsw m5, m7
16992packuswb m3, m5
16993movu [r0 + 327 * 16], m3
16994
16995; mode 9 [row 17 - second half]
16996movu [r0 + 483 * 16], m3
16997
16998; mode 7 [row 4]
16999movu m6, [r5 + 13 * 16]
17000pmaddubsw m3, m0, m6
17001pmulhrsw m3, m7
17002pmaddubsw m5, m2, m6
17003pmulhrsw m5, m7
17004packuswb m3, m5
17005movu [r0 + 328 * 16], m3
17006
17007; mode 8 [row 8 - first half]
17008movu [r0 + 400 * 16], m3
17009
17010pmaddubsw m3, m1, m6
17011pmulhrsw m3, m7
17012pmaddubsw m5, m4, m6
17013pmulhrsw m5, m7
17014packuswb m3, m5
17015movu [r0 + 329 * 16], m3
17016
17017; mode 8 [row 8 - second half]
17018movu [r0 + 401 * 16], m3
17019
17020; mode 7 [row 5]
17021movu m6, [r5 + 22 * 16]
17022pmaddubsw m3, m0, m6
17023pmulhrsw m3, m7
17024pmaddubsw m5, m2, m6
17025pmulhrsw m5, m7
17026packuswb m3, m5
17027movu [r0 + 330 * 16], m3
17028
17029; mode 9 [row 26 - first half]
17030movu [r0 + 500 * 16], m3
17031
17032pmaddubsw m3, m1, m6
17033pmulhrsw m3, m7
17034pmaddubsw m5, m4, m6
17035pmulhrsw m5, m7
17036packuswb m3, m5
17037movu [r0 + 331 * 16], m3
17038
17039; mode 9 [row 26 - second half]
17040movu [r0 + 501 * 16], m3
17041
17042; mode 8 [row 6]
17043movu m6, [r5 + 3 * 16]
17044pmaddubsw m3, m0, m6
17045pmulhrsw m3, m7
17046pmaddubsw m5, m2, m6
17047pmulhrsw m5, m7
17048packuswb m3, m5
17049movu [r0 + 396 * 16], m3
17050pmaddubsw m3, m1, m6
17051pmulhrsw m3, m7
17052pmaddubsw m5, m4, m6
17053pmulhrsw m5, m7
17054packuswb m3, m5
17055movu [r0 + 397 * 16], m3
17056
17057; mode 9 [row 18]
17058movu m6, [r5 + 6 * 16]
17059pmaddubsw m3, m0, m6
17060pmulhrsw m3, m7
17061pmaddubsw m5, m2, m6
17062pmulhrsw m5, m7
17063packuswb m3, m5
17064movu [r0 + 484 * 16], m3
17065pmaddubsw m3, m1, m6
17066pmulhrsw m3, m7
17067pmaddubsw m5, m4, m6
17068pmulhrsw m5, m7
17069packuswb m3, m5
17070movu [r0 + 485 * 16], m3
17071
17072; mode 9 [row 21]
17073movu m6, [r5 + 12 * 16]
17074pmaddubsw m3, m0, m6
17075pmulhrsw m3, m7
17076pmaddubsw m5, m2, m6
17077pmulhrsw m5, m7
17078packuswb m3, m5
17079movu [r0 + 490 * 16], m3
17080pmaddubsw m3, m1, m6
17081pmulhrsw m3, m7
17082pmaddubsw m5, m4, m6
17083pmulhrsw m5, m7
17084packuswb m3, m5
17085movu [r0 + 491 * 16], m3
17086
17087; mode 9 [row 22]
17088movu m6, [r5 + 14 * 16]
17089pmaddubsw m3, m0, m6
17090pmulhrsw m3, m7
17091pmaddubsw m5, m2, m6
17092pmulhrsw m5, m7
17093packuswb m3, m5
17094movu [r0 + 492 * 16], m3
17095pmaddubsw m3, m1, m6
17096pmulhrsw m3, m7
17097pmaddubsw m5, m4, m6
17098pmulhrsw m5, m7
17099packuswb m3, m5
17100movu [r0 + 493 * 16], m3
17101
17102; mode 9 [row 23]
17103movu m6, [r5 + 16 * 16]
17104pmaddubsw m3, m0, m6
17105pmulhrsw m3, m7
17106pmaddubsw m5, m2, m6
17107pmulhrsw m5, m7
17108packuswb m3, m5
17109movu [r0 + 494 * 16], m3
17110pmaddubsw m3, m1, m6
17111pmulhrsw m3, m7
17112pmaddubsw m5, m4, m6
17113pmulhrsw m5, m7
17114packuswb m3, m5
17115movu [r0 + 495 * 16], m3
17116
17117; mode 9 [row 27]
17118movu m6, [r5 + 24 * 16]
17119pmaddubsw m3, m0, m6
17120pmulhrsw m3, m7
17121pmaddubsw m5, m2, m6
17122pmulhrsw m5, m7
17123packuswb m3, m5
17124movu [r0 + 502 * 16], m3
17125pmaddubsw m3, m1, m6
17126pmulhrsw m3, m7
17127pmaddubsw m5, m4, m6
17128pmulhrsw m5, m7
17129packuswb m3, m5
17130movu [r0 + 503 * 16], m3
17131
17132; mode 9 [row 28]
17133movu m6, [r5 + 26 * 16]
17134pmaddubsw m3, m0, m6
17135pmulhrsw m3, m7
17136pmaddubsw m5, m2, m6
17137pmulhrsw m5, m7
17138packuswb m3, m5
17139movu [r0 + 504 * 16], m3
17140pmaddubsw m3, m1, m6
17141pmulhrsw m3, m7
17142pmaddubsw m5, m4, m6
17143pmulhrsw m5, m7
17144packuswb m3, m5
17145movu [r0 + 505 * 16], m3
17146
17147; mode 9 [row 30]
17148movu m6, [r5 + 30 * 16]
17149pmaddubsw m3, m0, m6
17150pmulhrsw m3, m7
17151pmaddubsw m5, m2, m6
17152pmulhrsw m5, m7
17153packuswb m3, m5
17154movu [r0 + 508 * 16], m3
17155pmaddubsw m3, m1, m6
17156pmulhrsw m3, m7
17157pmaddubsw m5, m4, m6
17158pmulhrsw m5, m7
17159packuswb m3, m5
17160movu [r0 + 509 * 16], m3
17161
17162; mode 8 [row 7]
17163movu m6, [r5 + 8 * 16]
17164pmaddubsw m3, m0, m6
17165pmulhrsw m3, m7
17166pmaddubsw m5, m2, m6
17167pmulhrsw m5, m7
17168packuswb m3, m5
17169movu [r0 + 398 * 16], m3
17170
17171; mode 9 [row 19 - first half]
17172movu [r0 + 486 * 16], m3
17173
17174pmaddubsw m3, m1, m6
17175pmulhrsw m3, m7
17176pmaddubsw m5, m4, m6
17177pmulhrsw m5, m7
17178packuswb m3, m5
17179movu [r0 + 399 * 16], m3
17180
17181; mode 9 [row 19 - second half]
17182movu [r0 + 487 * 16], m3
17183
17184; mode 8 [row 9]
17185movu m6, [r5 + 18 * 16]
17186pmaddubsw m3, m0, m6
17187pmulhrsw m3, m7
17188pmaddubsw m5, m2, m6
17189pmulhrsw m5, m7
17190packuswb m3, m5
17191movu [r0 + 402 * 16], m3
17192
17193; mode 9 [row 24 - first half]
17194movu [r0 + 496 * 16], m3
17195
17196pmaddubsw m3, m1, m6
17197pmulhrsw m3, m7
17198pmaddubsw m5, m4, m6
17199pmulhrsw m5, m7
17200packuswb m3, m5
17201movu [r0 + 403 * 16], m3
17202
17203; mode 9 [row 24 - second half]
17204movu [r0 + 497 * 16], m3
17205
17206; mode 8 [row 10]
17207movu m6, [r5 + 23 * 16]
17208pmaddubsw m3, m0, m6
17209pmulhrsw m3, m7
17210pmaddubsw m5, m2, m6
17211pmulhrsw m5, m7
17212packuswb m3, m5
17213movu [r0 + 404 * 16], m3
17214pmaddubsw m3, m1, m6
17215pmulhrsw m3, m7
17216pmaddubsw m5, m4, m6
17217pmulhrsw m5, m7
17218packuswb m3, m5
17219movu [r0 + 405 * 16], m3
17220
17221; mode 8 [row 11]
17222movu m6, [r5 + 28 * 16]
17223pmaddubsw m3, m0, m6
17224pmulhrsw m3, m7
17225pmaddubsw m5, m2, m6
17226pmulhrsw m5, m7
17227packuswb m3, m5
17228movu [r0 + 406 * 16], m3
17229
17230; mode 9 [row 29 - first half]
17231movu [r0 + 506 * 16], m3
17232
17233pmaddubsw m3, m1, m6
17234pmulhrsw m3, m7
17235pmaddubsw m5, m4, m6
17236pmulhrsw m5, m7
17237packuswb m3, m5
17238movu [r0 + 407 * 16], m3
17239
17240; mode 9 [row 29 - second half]
17241movu [r0 + 507 * 16], m3
17242
17243; mode 3 [row 2]
17244movu m6, [r5 + 14 * 16]
17245movu m0, [r4 + 3]
17246movd m1, [r4 + 4]
17247palignr m1, m0, 1
17248punpcklbw m0, m1
17249pmaddubsw m1, m0, m6
17250pmulhrsw m1, m7
17251movu m2, [r4 + 11]
17252movd m3, [r4 + 12]
17253palignr m3, m2, 1
17254punpcklbw m2, m3
17255pmaddubsw m3, m2, m6
17256pmulhrsw m3, m7
17257packuswb m1, m3
17258movu [r0 + 68 * 16], m1
17259
17260; mode 6 [row 5 - first half]
17261movu [r0 + 266 * 16], m1
17262
17263movu m1, [r4 + 19]
17264movd m3, [r4 + 20]
17265palignr m3, m1, 1
17266punpcklbw m1, m3
17267pmaddubsw m3, m1, m6
17268pmulhrsw m3, m7
17269movu m4, [r4 + 27]
17270movd m5, [r4 + 28]
17271palignr m5, m4, 1
17272punpcklbw m4, m5
17273pmaddubsw m5, m4, m6
17274pmulhrsw m5, m7
17275packuswb m3, m5
17276movu [r0 + 69 * 16], m3
17277
17278; mode 6 [row 5 - second half]
17279movu [r0 + 267 * 16], m3
17280
17281; mode 4 [row 3]
17282movu m6, [r5 + 20 * 16]
17283pmaddubsw m3, m0, m6
17284pmulhrsw m3, m7
17285pmaddubsw m5, m2, m6
17286pmulhrsw m5, m7
17287packuswb m3, m5
17288movu [r0 + 134 * 16], m3
17289pmaddubsw m3, m1, m6
17290pmulhrsw m3, m7
17291pmaddubsw m5, m4, m6
17292pmulhrsw m5, m7
17293packuswb m3, m5
17294movu [r0 + 135 * 16], m3
17295
17296; mode 5 [row 3]
17297movu m6, [r5 + 4 * 16]
17298pmaddubsw m3, m0, m6
17299pmulhrsw m3, m7
17300pmaddubsw m5, m2, m6
17301pmulhrsw m5, m7
17302packuswb m3, m5
17303movu [r0 + 198 * 16], m3
17304pmaddubsw m3, m1, m6
17305pmulhrsw m3, m7
17306pmaddubsw m5, m4, m6
17307pmulhrsw m5, m7
17308packuswb m3, m5
17309movu [r0 + 199 * 16], m3
17310
17311; mode 5 [row 4]
17312movu m6, [r5 + 21 * 16]
17313pmaddubsw m3, m0, m6
17314pmulhrsw m3, m7
17315pmaddubsw m5, m2, m6
17316pmulhrsw m5, m7
17317packuswb m3, m5
17318movu [r0 + 200 * 16], m3
17319
17320; mode 8 [row 16 - first half]
17321movu [r0 + 416 * 16], m3
17322
17323pmaddubsw m3, m1, m6
17324pmulhrsw m3, m7
17325pmaddubsw m5, m4, m6
17326pmulhrsw m5, m7
17327packuswb m3, m5
17328movu [r0 + 201 * 16], m3
17329
17330; mode 8 [row 16 - second half]
17331movu [r0 + 417 * 16], m3
17332
17333; mode 6 [row 4]
17334movu m6, [r5 + 1 * 16]
17335pmaddubsw m3, m0, m6
17336pmulhrsw m3, m7
17337pmaddubsw m5, m2, m6
17338pmulhrsw m5, m7
17339packuswb m3, m5
17340movu [r0 + 264 * 16], m3
17341
17342; mode 8 [row 12 - first half]
17343movu [r0 + 408 * 16], m3
17344
17345pmaddubsw m3, m1, m6
17346pmulhrsw m3, m7
17347pmaddubsw m5, m4, m6
17348pmulhrsw m5, m7
17349packuswb m3, m5
17350movu [r0 + 265 * 16], m3
17351
17352; mode 8 [row 12 - second half]
17353movu [r0 + 409 * 16], m3
17354
17355; mode 6 [row 6]
17356movu m6, [r5 + 27 * 16]
17357pmaddubsw m3, m0, m6
17358pmulhrsw m3, m7
17359pmaddubsw m5, m2, m6
17360pmulhrsw m5, m7
17361packuswb m3, m5
17362movu [r0 + 268 * 16], m3
17363pmaddubsw m3, m1, m6
17364pmulhrsw m3, m7
17365pmaddubsw m5, m4, m6
17366pmulhrsw m5, m7
17367packuswb m3, m5
17368movu [r0 + 269 * 16], m3
17369
17370; mode 7 [row 7]
17371movu m6, [r5 + 8 * 16]
17372pmaddubsw m3, m0, m6
17373pmulhrsw m3, m7
17374pmaddubsw m5, m2, m6
17375pmulhrsw m5, m7
17376packuswb m3, m5
17377movu [r0 + 334 * 16], m3
17378pmaddubsw m3, m1, m6
17379pmulhrsw m3, m7
17380pmaddubsw m5, m4, m6
17381pmulhrsw m5, m7
17382packuswb m3, m5
17383movu [r0 + 335 * 16], m3
17384
17385; mode 7 [row 8]
17386movu m6, [r5 + 17 * 16]
17387pmaddubsw m3, m0, m6
17388pmulhrsw m3, m7
17389pmaddubsw m5, m2, m6
17390pmulhrsw m5, m7
17391packuswb m3, m5
17392movu [r0 + 336 * 16], m3
17393pmaddubsw m3, m1, m6
17394pmulhrsw m3, m7
17395pmaddubsw m5, m4, m6
17396pmulhrsw m5, m7
17397packuswb m3, m5
17398movu [r0 + 337 * 16], m3
17399
17400; mode 7 [row 9]
17401movu m6, [r5 + 26 * 16]
17402pmaddubsw m3, m0, m6
17403pmulhrsw m3, m7
17404pmaddubsw m5, m2, m6
17405pmulhrsw m5, m7
17406packuswb m3, m5
17407movu [r0 + 338 * 16], m3
17408
17409; mode 8 [row 17 - first half]
17410movu [r0 + 418 * 16], m3
17411
17412pmaddubsw m3, m1, m6
17413pmulhrsw m3, m7
17414pmaddubsw m5, m4, m6
17415pmulhrsw m5, m7
17416packuswb m3, m5
17417movu [r0 + 339 * 16], m3
17418
17419; mode 8 [row 17 - second half]
17420movu [r0 + 419 * 16], m3
17421
17422; mode 8 [row 13]
17423movu m6, [r5 + 6 * 16]
17424pmaddubsw m3, m0, m6
17425pmulhrsw m3, m7
17426pmaddubsw m5, m2, m6
17427pmulhrsw m5, m7
17428packuswb m3, m5
17429movu [r0 + 410 * 16], m3
17430pmaddubsw m3, m1, m6
17431pmulhrsw m3, m7
17432pmaddubsw m5, m4, m6
17433pmulhrsw m5, m7
17434packuswb m3, m5
17435movu [r0 + 411 * 16], m3
17436
17437; mode 8 [row 14]
17438movu m6, [r5 + 11 * 16]
17439pmaddubsw m3, m0, m6
17440pmulhrsw m3, m7
17441pmaddubsw m5, m2, m6
17442pmulhrsw m5, m7
17443packuswb m3, m5
17444movu [r0 + 412 * 16], m3
17445pmaddubsw m3, m1, m6
17446pmulhrsw m3, m7
17447pmaddubsw m5, m4, m6
17448pmulhrsw m5, m7
17449packuswb m3, m5
17450movu [r0 + 413 * 16], m3
17451
17452; mode 8 [row 15]
17453movu m6, [r5 + 16 * 16]
17454pmaddubsw m3, m0, m6
17455pmulhrsw m3, m7
17456pmaddubsw m5, m2, m6
17457pmulhrsw m5, m7
17458packuswb m3, m5
17459movu [r0 + 414 * 16], m3
17460pmaddubsw m3, m1, m6
17461pmulhrsw m3, m7
17462pmaddubsw m5, m4, m6
17463pmulhrsw m5, m7
17464packuswb m3, m5
17465movu [r0 + 415 * 16], m3
17466
17467; mode 8 [row 18]
17468movu m6, [r5 + 31 * 16]
17469pmaddubsw m3, m0, m6
17470pmulhrsw m3, m7
17471pmaddubsw m5, m2, m6
17472pmulhrsw m5, m7
17473packuswb m3, m5
17474movu [r0 + 420 * 16], m3
17475pmaddubsw m3, m1, m6
17476pmulhrsw m3, m7
17477pmaddubsw m5, m4, m6
17478pmulhrsw m5, m7
17479packuswb m3, m5
17480movu [r0 + 421 * 16], m3
17481
17482; mode 3 [row 3]
17483movu m6, [r5 + 8 * 16]
17484movu m0, [r4 + 4]
17485movd m1, [r4 + 5]
17486palignr m1, m0, 1
17487punpcklbw m0, m1
17488pmaddubsw m1, m0, m6
17489pmulhrsw m1, m7
17490movu m2, [r4 + 12]
17491movd m3, [r4 + 13]
17492palignr m3, m2, 1
17493punpcklbw m2, m3
17494pmaddubsw m3, m2, m6
17495pmulhrsw m3, m7
17496packuswb m1, m3
17497movu [r0 + 70 * 16], m1
17498
17499; mode 6 [row 7 - first half]
17500movu [r0 + 270 * 16], m1
17501
17502movu m1, [r4 + 20]
17503movd m3, [r4 + 21]
17504palignr m3, m1, 1
17505punpcklbw m1, m3
17506pmaddubsw m3, m1, m6
17507pmulhrsw m3, m7
17508movu m4, [r4 + 28]
17509movd m5, [r4 + 29]
17510palignr m5, m4, 1
17511punpcklbw m4, m5
17512pmaddubsw m5, m4, m6
17513pmulhrsw m5, m7
17514packuswb m3, m5
17515movu [r0 + 71 * 16], m3
17516
17517; mode 6 [row 7 - second half]
17518movu [r0 + 271 * 16], m3
17519
17520; mode 4 [row 4]
17521movu m6, [r5 + 9 * 16]
17522pmaddubsw m3, m0, m6
17523pmulhrsw m3, m7
17524pmaddubsw m5, m2, m6
17525pmulhrsw m5, m7
17526packuswb m3, m5
17527movu [r0 + 136 * 16], m3
17528
17529; mode 8 [row 20 - first half]
17530movu [r0 + 424 * 16], m3
17531
17532pmaddubsw m3, m1, m6
17533pmulhrsw m3, m7
17534pmaddubsw m5, m4, m6
17535pmulhrsw m5, m7
17536packuswb m3, m5
17537movu [r0 + 137 * 16], m3
17538
17539; mode 8 [row 20 - second half]
17540movu [r0 + 425 * 16], m3
17541
17542; mode 4 [row 5]
17543movu m6, [r5 + 30 * 16]
17544pmaddubsw m3, m0, m6
17545pmulhrsw m3, m7
17546pmaddubsw m5, m2, m6
17547pmulhrsw m5, m7
17548packuswb m3, m5
17549movu [r0 + 138 * 16], m3
17550
17551; mode 7 [row 13 - first half]
17552movu [r0 + 346 * 16], m3
17553
17554pmaddubsw m3, m1, m6
17555pmulhrsw m3, m7
17556pmaddubsw m5, m4, m6
17557pmulhrsw m5, m7
17558packuswb m3, m5
17559movu [r0 + 139 * 16], m3
17560
17561; mode 7 [row 13 - second half]
17562movu [r0 + 347 * 16], m3
17563
17564; mode 5 [row 5]
17565movu m6, [r5 + 6 * 16]
17566pmaddubsw m3, m0, m6
17567pmulhrsw m3, m7
17568pmaddubsw m5, m2, m6
17569pmulhrsw m5, m7
17570packuswb m3, m5
17571movu [r0 + 202 * 16], m3
17572pmaddubsw m3, m1, m6
17573pmulhrsw m3, m7
17574pmaddubsw m5, m4, m6
17575pmulhrsw m5, m7
17576packuswb m3, m5
17577movu [r0 + 203 * 16], m3
17578
17579; mode 5 [row 6]
17580movu m6, [r5 + 23 * 16]
17581pmaddubsw m3, m0, m6
17582pmulhrsw m3, m7
17583pmaddubsw m5, m2, m6
17584pmulhrsw m5, m7
17585packuswb m3, m5
17586movu [r0 + 204 * 16], m3
17587pmaddubsw m3, m1, m6
17588pmulhrsw m3, m7
17589pmaddubsw m5, m4, m6
17590pmulhrsw m5, m7
17591packuswb m3, m5
17592movu [r0 + 205 * 16], m3
17593
17594; mode 6 [row 8]
17595movu m6, [r5 + 21 * 16]
17596pmaddubsw m3, m0, m6
17597pmulhrsw m3, m7
17598pmaddubsw m5, m2, m6
17599pmulhrsw m5, m7
17600packuswb m3, m5
17601movu [r0 + 272 * 16], m3
17602
17603; mode 7 [row 12 - first half]
17604movu [r0 + 344 * 16], m3
17605
17606pmaddubsw m3, m1, m6
17607pmulhrsw m3, m7
17608pmaddubsw m5, m4, m6
17609pmulhrsw m5, m7
17610packuswb m3, m5
17611movu [r0 + 273 * 16], m3
17612
17613; mode 7 [row 12 - second half]
17614movu [r0 + 345 * 16], m3
17615
17616; mode 7 [row 10]
17617movu m6, [r5 + 3 * 16]
17618pmaddubsw m3, m0, m6
17619pmulhrsw m3, m7
17620pmaddubsw m5, m2, m6
17621pmulhrsw m5, m7
17622packuswb m3, m5
17623movu [r0 + 340 * 16], m3
17624pmaddubsw m3, m1, m6
17625pmulhrsw m3, m7
17626pmaddubsw m5, m4, m6
17627pmulhrsw m5, m7
17628packuswb m3, m5
17629movu [r0 + 341 * 16], m3
17630
17631; mode 7 [row 11]
17632movu m6, [r5 + 12 * 16]
17633pmaddubsw m3, m0, m6
17634pmulhrsw m3, m7
17635pmaddubsw m5, m2, m6
17636pmulhrsw m5, m7
17637packuswb m3, m5
17638movu [r0 + 342 * 16], m3
17639pmaddubsw m3, m1, m6
17640pmulhrsw m3, m7
17641pmaddubsw m5, m4, m6
17642pmulhrsw m5, m7
17643packuswb m3, m5
17644movu [r0 + 343 * 16], m3
17645
17646; mode 8 [row 19]
17647movu m6, [r5 + 4 * 16]
17648pmaddubsw m3, m0, m6
17649pmulhrsw m3, m7
17650pmaddubsw m5, m2, m6
17651pmulhrsw m5, m7
17652packuswb m3, m5
17653movu [r0 + 422 * 16], m3
17654pmaddubsw m3, m1, m6
17655pmulhrsw m3, m7
17656pmaddubsw m5, m4, m6
17657pmulhrsw m5, m7
17658packuswb m3, m5
17659movu [r0 + 423 * 16], m3
17660
17661; mode 8 [row 21]
17662movu m6, [r5 + 14 * 16]
17663pmaddubsw m3, m0, m6
17664pmulhrsw m3, m7
17665pmaddubsw m5, m2, m6
17666pmulhrsw m5, m7
17667packuswb m3, m5
17668movu [r0 + 426 * 16], m3
17669pmaddubsw m3, m1, m6
17670pmulhrsw m3, m7
17671pmaddubsw m5, m4, m6
17672pmulhrsw m5, m7
17673packuswb m3, m5
17674movu [r0 + 427 * 16], m3
17675
17676; mode 8 [row 22]
17677movu m6, [r5 + 19 * 16]
17678pmaddubsw m3, m0, m6
17679pmulhrsw m3, m7
17680pmaddubsw m5, m2, m6
17681pmulhrsw m5, m7
17682packuswb m3, m5
17683movu [r0 + 428 * 16], m3
17684pmaddubsw m3, m1, m6
17685pmulhrsw m3, m7
17686pmaddubsw m5, m4, m6
17687pmulhrsw m5, m7
17688packuswb m3, m5
17689movu [r0 + 429 * 16], m3
17690
17691; mode 8 [row 23]
17692movu m6, [r5 + 24 * 16]
17693pmaddubsw m3, m0, m6
17694pmulhrsw m3, m7
17695pmaddubsw m5, m2, m6
17696pmulhrsw m5, m7
17697packuswb m3, m5
17698movu [r0 + 430 * 16], m3
17699pmaddubsw m3, m1, m6
17700pmulhrsw m3, m7
17701pmaddubsw m5, m4, m6
17702pmulhrsw m5, m7
17703packuswb m3, m5
17704movu [r0 + 431 * 16], m3
17705
17706; mode 8 [row 24]
17707movu m6, [r5 + 29 * 16]
17708pmaddubsw m3, m0, m6
17709pmulhrsw m3, m7
17710pmaddubsw m5, m2, m6
17711pmulhrsw m5, m7
17712packuswb m3, m5
17713movu [r0 + 432 * 16], m3
17714pmaddubsw m3, m1, m6
17715pmulhrsw m3, m7
17716pmaddubsw m5, m4, m6
17717pmulhrsw m5, m7
17718packuswb m3, m5
17719movu [r0 + 433 * 16], m3
17720
17721; mode 3 [row 4]
17722movu m6, [r5 + 2 * 16]
17723movu m0, [r4 + 5]
17724movd m1, [r4 + 6]
17725palignr m1, m0, 1
17726punpcklbw m0, m1
17727pmaddubsw m1, m0, m6
17728pmulhrsw m1, m7
17729movu m2, [r4 + 13]
17730movd m3, [r4 + 14]
17731palignr m3, m2, 1
17732punpcklbw m2, m3
17733pmaddubsw m3, m2, m6
17734pmulhrsw m3, m7
17735packuswb m1, m3
17736movu [r0 + 72 * 16], m1
17737
17738; mode 6 [row 9 - first half]
17739movu [r0 + 274 * 16], m1
17740
17741; mode 8 [row 25 - first half]
17742movu [r0 + 434 * 16], m1
17743
17744movu m1, [r4 + 21]
17745movd m3, [r4 + 22]
17746palignr m3, m1, 1
17747punpcklbw m1, m3
17748pmaddubsw m3, m1, m6
17749pmulhrsw m3, m7
17750movu m4, [r4 + 29]
17751movd m5, [r4 + 30]
17752palignr m5, m4, 1
17753punpcklbw m4, m5
17754pmaddubsw m5, m4, m6
17755pmulhrsw m5, m7
17756packuswb m3, m5
17757movu [r0 + 73 * 16], m3
17758
17759; mode 6 [row 9 - second half]
17760movu [r0 + 275 * 16], m3
17761
17762; mode 8 [row 25 - second half]
17763movu [r0 + 435 * 16], m3
17764
17765; mode 3 [row 5]
17766movu m6, [r5 + 28 * 16]
17767pmaddubsw m3, m0, m6
17768pmulhrsw m3, m7
17769pmaddubsw m5, m2, m6
17770pmulhrsw m5, m7
17771packuswb m3, m5
17772movu [r0 + 74 * 16], m3
17773
17774; mode 6 [row 11 - first half]
17775movu [r0 + 278 * 16], m3
17776
17777pmaddubsw m3, m1, m6
17778pmulhrsw m3, m7
17779pmaddubsw m5, m4, m6
17780pmulhrsw m5, m7
17781packuswb m3, m5
17782movu [r0 + 75 * 16], m3
17783
17784; mode 6 [row 11 - second half]
17785movu [r0 + 279 * 16], m3
17786
17787; mode 4 [row 6]
17788movu m6, [r5 + 19 * 16]
17789pmaddubsw m3, m0, m6
17790pmulhrsw m3, m7
17791pmaddubsw m5, m2, m6
17792pmulhrsw m5, m7
17793packuswb m3, m5
17794movu [r0 + 140 * 16], m3
17795pmaddubsw m3, m1, m6
17796pmulhrsw m3, m7
17797pmaddubsw m5, m4, m6
17798pmulhrsw m5, m7
17799packuswb m3, m5
17800movu [r0 + 141 * 16], m3
17801
17802; mode 5 [row 7]
17803movu m6, [r5 + 8 * 16]
17804pmaddubsw m3, m0, m6
17805pmulhrsw m3, m7
17806pmaddubsw m5, m2, m6
17807pmulhrsw m5, m7
17808packuswb m3, m5
17809movu [r0 + 206 * 16], m3
17810pmaddubsw m3, m1, m6
17811pmulhrsw m3, m7
17812pmaddubsw m5, m4, m6
17813pmulhrsw m5, m7
17814packuswb m3, m5
17815movu [r0 + 207 * 16], m3
17816
17817; mode 5 [row 8]
17818movu m6, [r5 + 25 * 16]
17819pmaddubsw m3, m0, m6
17820pmulhrsw m3, m7
17821pmaddubsw m5, m2, m6
17822pmulhrsw m5, m7
17823packuswb m3, m5
17824movu [r0 + 208 * 16], m3
17825
17826; mode 7 [row 16 - first half]
17827movu [r0 + 352 * 16], m3
17828
17829pmaddubsw m3, m1, m6
17830pmulhrsw m3, m7
17831pmaddubsw m5, m4, m6
17832pmulhrsw m5, m7
17833packuswb m3, m5
17834movu [r0 + 209 * 16], m3
17835
17836; mode 7 [row 16 - second half]
17837movu [r0 + 353 * 16], m3
17838
17839; mode 6 [row 10]
17840movu m6, [r5 + 15 * 16]
17841pmaddubsw m3, m0, m6
17842pmulhrsw m3, m7
17843pmaddubsw m5, m2, m6
17844pmulhrsw m5, m7
17845packuswb m3, m5
17846movu [r0 + 276 * 16], m3
17847pmaddubsw m3, m1, m6
17848pmulhrsw m3, m7
17849pmaddubsw m5, m4, m6
17850pmulhrsw m5, m7
17851packuswb m3, m5
17852movu [r0 + 277 * 16], m3
17853
17854; mode 7 [row 14]
17855movu m6, [r5 + 7 * 16]
17856pmaddubsw m3, m0, m6
17857pmulhrsw m3, m7
17858pmaddubsw m5, m2, m6
17859pmulhrsw m5, m7
17860packuswb m3, m5
17861movu [r0 + 348 * 16], m3
17862
17863; mode 8 [row 26 - first half]
17864movu [r0 + 436 * 16], m3
17865
17866pmaddubsw m3, m1, m6
17867pmulhrsw m3, m7
17868pmaddubsw m5, m4, m6
17869pmulhrsw m5, m7
17870packuswb m3, m5
17871movu [r0 + 349 * 16], m3
17872
17873; mode 8 [row 26 - second half]
17874movu [r0 + 437 * 16], m3
17875
17876; mode 7 [row 15]
17877movu m6, [r5 + 16 * 16]
17878pmaddubsw m3, m0, m6
17879pmulhrsw m3, m7
17880pmaddubsw m5, m2, m6
17881pmulhrsw m5, m7
17882packuswb m3, m5
17883movu [r0 + 350 * 16], m3
17884pmaddubsw m3, m1, m6
17885pmulhrsw m3, m7
17886pmaddubsw m5, m4, m6
17887pmulhrsw m5, m7
17888packuswb m3, m5
17889movu [r0 + 351 * 16], m3
17890
17891; mode 8 [row 27]
17892movu m6, [r5 + 12 * 16]
17893pmaddubsw m3, m0, m6
17894pmulhrsw m3, m7
17895pmaddubsw m5, m2, m6
17896pmulhrsw m5, m7
17897packuswb m3, m5
17898movu [r0 + 438 * 16], m3
17899pmaddubsw m3, m1, m6
17900pmulhrsw m3, m7
17901pmaddubsw m5, m4, m6
17902pmulhrsw m5, m7
17903packuswb m3, m5
17904movu [r0 + 439 * 16], m3
17905
17906; mode 8 [row 28]
17907movu m6, [r5 + 17 * 16]
17908pmaddubsw m3, m0, m6
17909pmulhrsw m3, m7
17910pmaddubsw m5, m2, m6
17911pmulhrsw m5, m7
17912packuswb m3, m5
17913movu [r0 + 440 * 16], m3
17914pmaddubsw m3, m1, m6
17915pmulhrsw m3, m7
17916pmaddubsw m5, m4, m6
17917pmulhrsw m5, m7
17918packuswb m3, m5
17919movu [r0 + 441 * 16], m3
17920
17921; mode 8 [row 29]
17922movu m6, [r5 + 22 * 16]
17923pmaddubsw m3, m0, m6
17924pmulhrsw m3, m7
17925pmaddubsw m5, m2, m6
17926pmulhrsw m5, m7
17927packuswb m3, m5
17928movu [r0 + 442 * 16], m3
17929pmaddubsw m3, m1, m6
17930pmulhrsw m3, m7
17931pmaddubsw m5, m4, m6
17932pmulhrsw m5, m7
17933packuswb m3, m5
17934movu [r0 + 443 * 16], m3
17935
17936; mode 8 [row 30]
17937movu m6, [r5 + 27 * 16]
17938pmaddubsw m3, m0, m6
17939pmulhrsw m3, m7
17940pmaddubsw m5, m2, m6
17941pmulhrsw m5, m7
17942packuswb m3, m5
17943movu [r0 + 444 * 16], m3
17944pmaddubsw m3, m1, m6
17945pmulhrsw m3, m7
17946pmaddubsw m5, m4, m6
17947pmulhrsw m5, m7
17948packuswb m3, m5
17949movu [r0 + 445 * 16], m3
17950
17951; mode 3 [row 6]
17952movu m6, [r5 + 22 * 16]
17953movu m0, [r4 + 6]
17954movd m1, [r4 + 7]
17955palignr m1, m0, 1
17956punpcklbw m0, m1
17957pmaddubsw m1, m0, m6
17958pmulhrsw m1, m7
17959movu m2, [r4 + 14]
17960movd m3, [r4 + 15]
17961palignr m3, m2, 1
17962punpcklbw m2, m3
17963pmaddubsw m3, m2, m6
17964pmulhrsw m3, m7
17965packuswb m1, m3
17966movu [r0 + 76 * 16], m1
17967
17968; mode 6 [row 13 - first half]
17969movu [r0 + 282 * 16], m1
17970
17971movu m1, [r4 + 22]
17972movd m3, [r4 + 23]
17973palignr m3, m1, 1
17974punpcklbw m1, m3
17975pmaddubsw m3, m1, m6
17976pmulhrsw m3, m7
17977movu m4, [r4 + 30]
17978movd m5, [r4 + 31]
17979palignr m5, m4, 1
17980punpcklbw m4, m5
17981pmaddubsw m5, m4, m6
17982pmulhrsw m5, m7
17983packuswb m3, m5
17984movu [r0 + 77 * 16], m3
17985
17986; mode 6 [row 13 - second half]
17987movu [r0 + 283 * 16], m3
17988
17989; mode 4 [row 7]
17990movu m6, [r5 + 8 * 16]
17991pmaddubsw m3, m0, m6
17992pmulhrsw m3, m7
17993pmaddubsw m5, m2, m6
17994pmulhrsw m5, m7
17995packuswb m3, m5
17996movu [r0 + 142 * 16], m3
17997pmaddubsw m3, m1, m6
17998pmulhrsw m3, m7
17999pmaddubsw m5, m4, m6
18000pmulhrsw m5, m7
18001packuswb m3, m5
18002movu [r0 + 143 * 16], m3
18003
18004; mode 4 [row 8]
18005movu m6, [r5 + 29 * 16]
18006pmaddubsw m3, m0, m6
18007pmulhrsw m3, m7
18008pmaddubsw m5, m2, m6
18009pmulhrsw m5, m7
18010packuswb m3, m5
18011movu [r0 + 144 * 16], m3
18012
18013; mode 7 [row 20 - first half]
18014movu [r0 + 360 * 16], m3
18015
18016pmaddubsw m3, m1, m6
18017pmulhrsw m3, m7
18018pmaddubsw m5, m4, m6
18019pmulhrsw m5, m7
18020packuswb m3, m5
18021movu [r0 + 145 * 16], m3
18022
18023; mode 7 [row 20 - second half]
18024movu [r0 + 361 * 16], m3
18025
18026; mode 5 [row 9]
18027movu m6, [r5 + 10 * 16]
18028pmaddubsw m3, m0, m6
18029pmulhrsw m3, m7
18030pmaddubsw m5, m2, m6
18031pmulhrsw m5, m7
18032packuswb m3, m5
18033movu [r0 + 210 * 16], m3
18034pmaddubsw m3, m1, m6
18035pmulhrsw m3, m7
18036pmaddubsw m5, m4, m6
18037pmulhrsw m5, m7
18038packuswb m3, m5
18039movu [r0 + 211 * 16], m3
18040
18041; mode 5 [row 10]
18042movu m6, [r5 + 27 * 16]
18043pmaddubsw m3, m0, m6
18044pmulhrsw m3, m7
18045pmaddubsw m5, m2, m6
18046pmulhrsw m5, m7
18047packuswb m3, m5
18048movu [r0 + 212 * 16], m3
18049pmaddubsw m3, m1, m6
18050pmulhrsw m3, m7
18051pmaddubsw m5, m4, m6
18052pmulhrsw m5, m7
18053packuswb m3, m5
18054movu [r0 + 213 * 16], m3
18055
18056; mode 7 [row 17]
18057movu m6, [r5 + 2 * 16]
18058pmaddubsw m3, m0, m6
18059pmulhrsw m3, m7
18060pmaddubsw m5, m2, m6
18061pmulhrsw m5, m7
18062packuswb m3, m5
18063movu [r0 + 354 * 16], m3
18064pmaddubsw m3, m1, m6
18065pmulhrsw m3, m7
18066pmaddubsw m5, m4, m6
18067pmulhrsw m5, m7
18068packuswb m3, m5
18069movu [r0 + 355 * 16], m3
18070
18071; mode 7 [row 18]
18072movu m6, [r5 + 11 * 16]
18073pmaddubsw m3, m0, m6
18074pmulhrsw m3, m7
18075pmaddubsw m5, m2, m6
18076pmulhrsw m5, m7
18077packuswb m3, m5
18078movu [r0 + 356 * 16], m3
18079pmaddubsw m3, m1, m6
18080pmulhrsw m3, m7
18081pmaddubsw m5, m4, m6
18082pmulhrsw m5, m7
18083packuswb m3, m5
18084movu [r0 + 357 * 16], m3
18085
18086; mode 7 [row 19]
18087movu m6, [r5 + 20 * 16]
18088pmaddubsw m3, m0, m6
18089pmulhrsw m3, m7
18090pmaddubsw m5, m2, m6
18091pmulhrsw m5, m7
18092packuswb m3, m5
18093movu [r0 + 358 * 16], m3
18094pmaddubsw m3, m1, m6
18095pmulhrsw m3, m7
18096pmaddubsw m5, m4, m6
18097pmulhrsw m5, m7
18098packuswb m3, m5
18099movu [r0 + 359 * 16], m3
18100
18101; mode 6 [row 12]
18102movu m6, [r5 + 9 * 16]
18103pmaddubsw m3, m0, m6
18104pmulhrsw m3, m7
18105pmaddubsw m5, m2, m6
18106pmulhrsw m5, m7
18107packuswb m3, m5
18108movu [r0 + 280 * 16], m3
18109pmaddubsw m3, m1, m6
18110pmulhrsw m3, m7
18111pmaddubsw m5, m4, m6
18112pmulhrsw m5, m7
18113packuswb m3, m5
18114movu [r0 + 281 * 16], m3
18115
18116; mode 3 [row 7]
18117movu m6, [r5 + 16 * 16]
18118movu m0, [r4 + 7]
18119movd m1, [r4 + 8]
18120palignr m1, m0, 1
18121punpcklbw m0, m1
18122pmaddubsw m1, m0, m6
18123pmulhrsw m1, m7
18124movu m2, [r4 + 15]
18125movd m3, [r4 + 16]
18126palignr m3, m2, 1
18127punpcklbw m2, m3
18128pmaddubsw m3, m2, m6
18129pmulhrsw m3, m7
18130packuswb m1, m3
18131movu [r0 + 78 * 16], m1
18132
18133; mode 6 [row 15 - first half]
18134movu [r0 + 286 * 16], m1
18135
18136movu m1, [r4 + 23]
18137movd m3, [r4 + 24]
18138palignr m3, m1, 1
18139punpcklbw m1, m3
18140pmaddubsw m3, m1, m6
18141pmulhrsw m3, m7
18142movu m4, [r4 + 31]
18143movd m5, [r4 + 32]
18144palignr m5, m4, 1
18145punpcklbw m4, m5
18146pmaddubsw m5, m4, m6
18147pmulhrsw m5, m7
18148packuswb m3, m5
18149movu [r0 + 79 * 16], m3
18150
18151; mode 6 [row 15 - second half]
18152movu [r0 + 287 * 16], m3
18153
18154; mode 4 [row 9]
18155movu m6, [r5 + 18 * 16]
18156pmaddubsw m3, m0, m6
18157pmulhrsw m3, m7
18158pmaddubsw m5, m2, m6
18159pmulhrsw m5, m7
18160packuswb m3, m5
18161movu [r0 + 146 * 16], m3
18162pmaddubsw m3, m1, m6
18163pmulhrsw m3, m7
18164pmaddubsw m5, m4, m6
18165pmulhrsw m5, m7
18166packuswb m3, m5
18167movu [r0 + 147 * 16], m3
18168
18169; mode 5 [row 11]
18170movu m6, [r5 + 12 * 16]
18171pmaddubsw m3, m0, m6
18172pmulhrsw m3, m7
18173pmaddubsw m5, m2, m6
18174pmulhrsw m5, m7
18175packuswb m3, m5
18176movu [r0 + 214 * 16], m3
18177pmaddubsw m3, m1, m6
18178pmulhrsw m3, m7
18179pmaddubsw m5, m4, m6
18180pmulhrsw m5, m7
18181packuswb m3, m5
18182movu [r0 + 215 * 16], m3
18183
18184; mode 5 [row 12]
18185movu m6, [r5 + 29 * 16]
18186pmaddubsw m3, m0, m6
18187pmulhrsw m3, m7
18188pmaddubsw m5, m2, m6
18189pmulhrsw m5, m7
18190packuswb m3, m5
18191movu [r0 + 216 * 16], m3
18192
18193; mode 6 [row 16 - first half]
18194movu [r0 + 288 * 16], m3
18195
18196pmaddubsw m3, m1, m6
18197pmulhrsw m3, m7
18198pmaddubsw m5, m4, m6
18199pmulhrsw m5, m7
18200packuswb m3, m5
18201movu [r0 + 217 * 16], m3
18202
18203; mode 6 [row 16 - second half]
18204movu [r0 + 289 * 16], m3
18205
18206; mode 6 [row 14]
18207movu m6, [r5 + 3 * 16]
18208pmaddubsw m3, m0, m6
18209pmulhrsw m3, m7
18210pmaddubsw m5, m2, m6
18211pmulhrsw m5, m7
18212packuswb m3, m5
18213movu [r0 + 284 * 16], m3
18214pmaddubsw m3, m1, m6
18215pmulhrsw m3, m7
18216pmaddubsw m5, m4, m6
18217pmulhrsw m5, m7
18218packuswb m3, m5
18219movu [r0 + 285 * 16], m3
18220
18221; mode 7 [row 21]
18222movu m6, [r5 + 6 * 16]
18223pmaddubsw m3, m0, m6
18224pmulhrsw m3, m7
18225pmaddubsw m5, m2, m6
18226pmulhrsw m5, m7
18227packuswb m3, m5
18228movu [r0 + 362 * 16], m3
18229pmaddubsw m3, m1, m6
18230pmulhrsw m3, m7
18231pmaddubsw m5, m4, m6
18232pmulhrsw m5, m7
18233packuswb m3, m5
18234movu [r0 + 363 * 16], m3
18235
18236; mode 7 [row 22]
18237movu m6, [r5 + 15 * 16]
18238pmaddubsw m3, m0, m6
18239pmulhrsw m3, m7
18240pmaddubsw m5, m2, m6
18241pmulhrsw m5, m7
18242packuswb m3, m5
18243movu [r0 + 364 * 16], m3
18244pmaddubsw m3, m1, m6
18245pmulhrsw m3, m7
18246pmaddubsw m5, m4, m6
18247pmulhrsw m5, m7
18248packuswb m3, m5
18249movu [r0 + 365 * 16], m3
18250
18251; mode 7 [row 23]
18252movu m6, [r5 + 24 * 16]
18253pmaddubsw m3, m0, m6
18254pmulhrsw m3, m7
18255pmaddubsw m5, m2, m6
18256pmulhrsw m5, m7
18257packuswb m3, m5
18258movu [r0 + 366 * 16], m3
18259pmaddubsw m3, m1, m6
18260pmulhrsw m3, m7
18261pmaddubsw m5, m4, m6
18262pmulhrsw m5, m7
18263packuswb m3, m5
18264movu [r0 + 367 * 16], m3
18265
18266; mode 3 [row 8]
18267movu m6, [r5 + 10 * 16]
18268movu m0, [r4 + 8]
18269movd m1, [r4 + 9]
18270palignr m1, m0, 1
18271punpcklbw m0, m1
18272pmaddubsw m1, m0, m6
18273pmulhrsw m1, m7
18274movu m2, [r4 + 16]
18275movd m3, [r4 + 17]
18276palignr m3, m2, 1
18277punpcklbw m2, m3
18278pmaddubsw m3, m2, m6
18279pmulhrsw m3, m7
18280packuswb m1, m3
18281movu [r0 + 80 * 16], m1
18282
18283; mode 6 [row 17 - first half]
18284movu [r0 + 290 * 16], m1
18285
18286; mode 7 [row 25 - first half]
18287movu [r0 + 370 * 16], m1
18288
18289movu m1, [r4 + 24]
18290movd m3, [r4 + 25]
18291palignr m3, m1, 1
18292punpcklbw m1, m3
18293pmaddubsw m3, m1, m6
18294pmulhrsw m3, m7
18295movu m4, [r4 + 32]
18296movd m5, [r4 + 33]
18297palignr m5, m4, 1
18298punpcklbw m4, m5
18299pmaddubsw m5, m4, m6
18300pmulhrsw m5, m7
18301packuswb m3, m5
18302movu [r0 + 81 * 16], m3
18303
18304; mode 6 [row 17 - second half]
18305movu [r0 + 291 * 16], m3
18306
18307; mode 7 [row 25 - second half]
18308movu [r0 + 371 * 16], m3
18309
18310; mode 4 [row 10]
18311movu m6, [r5 + 7 * 16]
18312pmaddubsw m3, m0, m6
18313pmulhrsw m3, m7
18314pmaddubsw m5, m2, m6
18315pmulhrsw m5, m7
18316packuswb m3, m5
18317movu [r0 + 148 * 16], m3
18318pmaddubsw m3, m1, m6
18319pmulhrsw m3, m7
18320pmaddubsw m5, m4, m6
18321pmulhrsw m5, m7
18322packuswb m3, m5
18323movu [r0 + 149 * 16], m3
18324
18325; mode 4 [row 11]
18326movu m6, [r5 + 28 * 16]
18327pmaddubsw m3, m0, m6
18328pmulhrsw m3, m7
18329pmaddubsw m5, m2, m6
18330pmulhrsw m5, m7
18331packuswb m3, m5
18332movu [r0 + 150 * 16], m3
18333
18334; mode 7 [row 27 - first half]
18335movu [r0 + 374 * 16], m3
18336
18337pmaddubsw m3, m1, m6
18338pmulhrsw m3, m7
18339pmaddubsw m5, m4, m6
18340pmulhrsw m5, m7
18341packuswb m3, m5
18342movu [r0 + 151 * 16], m3
18343
18344; mode 7 [row 27 - second half]
18345movu [r0 + 375 * 16], m3
18346
18347; mode 5 [row 13]
18348movu m6, [r5 + 14 * 16]
18349pmaddubsw m3, m0, m6
18350pmulhrsw m3, m7
18351pmaddubsw m5, m2, m6
18352pmulhrsw m5, m7
18353packuswb m3, m5
18354movu [r0 + 218 * 16], m3
18355pmaddubsw m3, m1, m6
18356pmulhrsw m3, m7
18357pmaddubsw m5, m4, m6
18358pmulhrsw m5, m7
18359packuswb m3, m5
18360movu [r0 + 219 * 16], m3
18361
18362; mode 5 [row 14]
18363movu m6, [r5 + 31 * 16]
18364pmaddubsw m3, m0, m6
18365pmulhrsw m3, m7
18366pmaddubsw m5, m2, m6
18367pmulhrsw m5, m7
18368packuswb m3, m5
18369movu [r0 + 220 * 16], m3
18370pmaddubsw m3, m1, m6
18371pmulhrsw m3, m7
18372pmaddubsw m5, m4, m6
18373pmulhrsw m5, m7
18374packuswb m3, m5
18375movu [r0 + 221 * 16], m3
18376
18377; mode 6 [row 18]
18378movu m6, [r5 + 23 * 16]
18379pmaddubsw m3, m0, m6
18380pmulhrsw m3, m7
18381pmaddubsw m5, m2, m6
18382pmulhrsw m5, m7
18383packuswb m3, m5
18384movu [r0 + 292 * 16], m3
18385pmaddubsw m3, m1, m6
18386pmulhrsw m3, m7
18387pmaddubsw m5, m4, m6
18388pmulhrsw m5, m7
18389packuswb m3, m5
18390movu [r0 + 293 * 16], m3
18391
18392; mode 7 [row 24]
18393movu m6, [r5 + 1 * 16]
18394pmaddubsw m3, m0, m6
18395pmulhrsw m3, m7
18396pmaddubsw m5, m2, m6
18397pmulhrsw m5, m7
18398packuswb m3, m5
18399movu [r0 + 368 * 16], m3
18400pmaddubsw m3, m1, m6
18401pmulhrsw m3, m7
18402pmaddubsw m5, m4, m6
18403pmulhrsw m5, m7
18404packuswb m3, m5
18405movu [r0 + 369 * 16], m3
18406
18407; mode 7 [row 26]
18408movu m6, [r5 + 19 * 16]
18409pmaddubsw m3, m0, m6
18410pmulhrsw m3, m7
18411pmaddubsw m5, m2, m6
18412pmulhrsw m5, m7
18413packuswb m3, m5
18414movu [r0 + 372 * 16], m3
18415pmaddubsw m3, m1, m6
18416pmulhrsw m3, m7
18417pmaddubsw m5, m4, m6
18418pmulhrsw m5, m7
18419packuswb m3, m5
18420movu [r0 + 373 * 16], m3
18421
18422; mode 3 [row 9]
18423movu m6, [r5 + 4 * 16]
18424movu m0, [r4 + 9]
18425movd m1, [r4 + 10]
18426palignr m1, m0, 1
18427punpcklbw m0, m1
18428pmaddubsw m1, m0, m6
18429pmulhrsw m1, m7
18430movu m2, [r4 + 17]
18431movd m3, [r4 + 18]
18432palignr m3, m2, 1
18433punpcklbw m2, m3
18434pmaddubsw m3, m2, m6
18435pmulhrsw m3, m7
18436packuswb m1, m3
18437movu [r0 + 82 * 16], m1
18438
18439; mode 6 [row 19 - first half]
18440movu [r0 + 294 * 16], m1
18441
18442movu m1, [r4 + 25]
18443movd m3, [r4 + 26]
18444palignr m3, m1, 1
18445punpcklbw m1, m3
18446pmaddubsw m3, m1, m6
18447pmulhrsw m3, m7
18448movu m4, [r4 + 33]
18449movd m5, [r4 + 34]
18450palignr m5, m4, 1
18451punpcklbw m4, m5
18452pmaddubsw m5, m4, m6
18453pmulhrsw m5, m7
18454packuswb m3, m5
18455movu [r0 + 83 * 16], m3
18456
18457; mode 6 [row 19 - second half]
18458movu [r0 + 295 * 16], m3
18459
18460; mode 4 [row 12]
18461movu m6, [r5 + 17 * 16]
18462pmaddubsw m3, m0, m6
18463pmulhrsw m3, m7
18464pmaddubsw m5, m2, m6
18465pmulhrsw m5, m7
18466packuswb m3, m5
18467movu [r0 + 152 * 16], m3
18468
18469; mode 6 [row 20 - first half]
18470movu [r0 + 296 * 16], m3
18471
18472pmaddubsw m3, m1, m6
18473pmulhrsw m3, m7
18474pmaddubsw m5, m4, m6
18475pmulhrsw m5, m7
18476packuswb m3, m5
18477movu [r0 + 153 * 16], m3
18478
18479; mode 6 [row 20 - second half]
18480movu [r0 + 297 * 16], m3
18481
18482; mode 3 [row 10]
18483movu m6, [r5 + 30 * 16]
18484pmaddubsw m3, m0, m6
18485pmulhrsw m3, m7
18486pmaddubsw m5, m2, m6
18487pmulhrsw m5, m7
18488packuswb m3, m5
18489movu [r0 + 84 * 16], m3
18490
18491; mode 6 [row 21 - first half]
18492movu [r0 + 298 * 16], m3
18493
18494pmaddubsw m3, m1, m6
18495pmulhrsw m3, m7
18496pmaddubsw m5, m4, m6
18497pmulhrsw m5, m7
18498packuswb m3, m5
18499movu [r0 + 85 * 16], m3
18500
18501; mode 6 [row 21 - second half]
18502movu [r0 + 299 * 16], m3
18503
18504; mode 5 [row 15]
18505movu m6, [r5 + 16 * 16]
18506pmaddubsw m3, m0, m6
18507pmulhrsw m3, m7
18508pmaddubsw m5, m2, m6
18509pmulhrsw m5, m7
18510packuswb m3, m5
18511movu [r0 + 222 * 16], m3
18512pmaddubsw m3, m1, m6
18513pmulhrsw m3, m7
18514pmaddubsw m5, m4, m6
18515pmulhrsw m5, m7
18516packuswb m3, m5
18517movu [r0 + 223 * 16], m3
18518
18519; mode 7 [row 28]
18520movu m6, [r5 + 5 * 16]
18521pmaddubsw m3, m0, m6
18522pmulhrsw m3, m7
18523pmaddubsw m5, m2, m6
18524pmulhrsw m5, m7
18525packuswb m3, m5
18526movu [r0 + 376 * 16], m3
18527pmaddubsw m3, m1, m6
18528pmulhrsw m3, m7
18529pmaddubsw m5, m4, m6
18530pmulhrsw m5, m7
18531packuswb m3, m5
18532movu [r0 + 377 * 16], m3
18533
18534; mode 7 [row 29]
18535movu m6, [r5 + 14 * 16]
18536pmaddubsw m3, m0, m6
18537pmulhrsw m3, m7
18538pmaddubsw m5, m2, m6
18539pmulhrsw m5, m7
18540packuswb m3, m5
18541movu [r0 + 378 * 16], m3
18542pmaddubsw m3, m1, m6
18543pmulhrsw m3, m7
18544pmaddubsw m5, m4, m6
18545pmulhrsw m5, m7
18546packuswb m3, m5
18547movu [r0 + 379 * 16], m3
18548
18549; mode 7 [row 30]
18550movu m6, [r5 + 23 * 16]
18551pmaddubsw m3, m0, m6
18552pmulhrsw m3, m7
18553pmaddubsw m5, m2, m6
18554pmulhrsw m5, m7
18555packuswb m3, m5
18556movu [r0 + 380 * 16], m3
18557pmaddubsw m3, m1, m6
18558pmulhrsw m3, m7
18559pmaddubsw m5, m4, m6
18560pmulhrsw m5, m7
18561packuswb m3, m5
18562movu [r0 + 381 * 16], m3
18563
18564; mode 3 [row 11]
18565movu m6, [r5 + 24 * 16]
18566movu m0, [r4 + 10]
18567movd m1, [r4 + 11]
18568palignr m1, m0, 1
18569punpcklbw m0, m1
18570pmaddubsw m1, m0, m6
18571pmulhrsw m1, m7
18572movu m2, [r4 + 18]
18573movd m3, [r4 + 19]
18574palignr m3, m2, 1
18575punpcklbw m2, m3
18576pmaddubsw m3, m2, m6
18577pmulhrsw m3, m7
18578packuswb m1, m3
18579movu [r0 + 86 * 16], m1
18580
18581; mode 6 [row 23 - first half]
18582movu [r0 + 302 * 16], m1
18583
18584movu m1, [r4 + 26]
18585movd m3, [r4 + 27]
18586palignr m3, m1, 1
18587punpcklbw m1, m3
18588pmaddubsw m3, m1, m6
18589pmulhrsw m3, m7
18590movu m4, [r4 + 34]
18591movd m5, [r4 + 35]
18592palignr m5, m4, 1
18593punpcklbw m4, m5
18594pmaddubsw m5, m4, m6
18595pmulhrsw m5, m7
18596packuswb m3, m5
18597movu [r0 + 87 * 16], m3
18598
18599; mode 6 [row 23 - second half]
18600movu [r0 + 303 * 16], m3
18601
18602; mode 4 [row 13]
18603movu m6, [r5 + 6 * 16]
18604pmaddubsw m3, m0, m6
18605pmulhrsw m3, m7
18606pmaddubsw m5, m2, m6
18607pmulhrsw m5, m7
18608packuswb m3, m5
18609movu [r0 + 154 * 16], m3
18610pmaddubsw m3, m1, m6
18611pmulhrsw m3, m7
18612pmaddubsw m5, m4, m6
18613pmulhrsw m5, m7
18614packuswb m3, m5
18615movu [r0 + 155 * 16], m3
18616
18617; mode 4 [row 14]
18618movu m6, [r5 + 27 * 16]
18619pmaddubsw m3, m0, m6
18620pmulhrsw m3, m7
18621pmaddubsw m5, m2, m6
18622pmulhrsw m5, m7
18623packuswb m3, m5
18624movu [r0 + 156 * 16], m3
18625pmaddubsw m3, m1, m6
18626pmulhrsw m3, m7
18627pmaddubsw m5, m4, m6
18628pmulhrsw m5, m7
18629packuswb m3, m5
18630movu [r0 + 157 * 16], m3
18631
18632; mode 5 [row 16]
18633movu m6, [r5 + 1 * 16]
18634pmaddubsw m3, m0, m6
18635pmulhrsw m3, m7
18636pmaddubsw m5, m2, m6
18637pmulhrsw m5, m7
18638packuswb m3, m5
18639movu [r0 + 224 * 16], m3
18640pmaddubsw m3, m1, m6
18641pmulhrsw m3, m7
18642pmaddubsw m5, m4, m6
18643pmulhrsw m5, m7
18644packuswb m3, m5
18645movu [r0 + 225 * 16], m3
18646
18647; mode 5 [row 17]
18648movu m6, [r5 + 18 * 16]
18649pmaddubsw m3, m0, m6
18650pmulhrsw m3, m7
18651pmaddubsw m5, m2, m6
18652pmulhrsw m5, m7
18653packuswb m3, m5
18654movu [r0 + 226 * 16], m3
18655pmaddubsw m3, m1, m6
18656pmulhrsw m3, m7
18657pmaddubsw m5, m4, m6
18658pmulhrsw m5, m7
18659packuswb m3, m5
18660movu [r0 + 227 * 16], m3
18661
18662; mode 6 [row 22]
18663movu m6, [r5 + 11 * 16]
18664pmaddubsw m3, m0, m6
18665pmulhrsw m3, m7
18666pmaddubsw m5, m2, m6
18667pmulhrsw m5, m7
18668packuswb m3, m5
18669movu [r0 + 300 * 16], m3
18670pmaddubsw m3, m1, m6
18671pmulhrsw m3, m7
18672pmaddubsw m5, m4, m6
18673pmulhrsw m5, m7
18674packuswb m3, m5
18675movu [r0 + 301 * 16], m3
18676
18677; mode 3 [row 12]
18678movu m6, [r5 + 18 * 16]
18679movu m0, [r4 + 11]
18680movd m1, [r4 + 12]
18681palignr m1, m0, 1
18682punpcklbw m0, m1
18683pmaddubsw m1, m0, m6
18684pmulhrsw m1, m7
18685movu m2, [r4 + 19]
18686movd m3, [r4 + 20]
18687palignr m3, m2, 1
18688punpcklbw m2, m3
18689pmaddubsw m3, m2, m6
18690pmulhrsw m3, m7
18691packuswb m1, m3
18692movu [r0 + 88 * 16], m1
18693
18694; mode 6 [row 25 - first half]
18695movu [r0 + 306 * 16], m1
18696
18697movu m1, [r4 + 27]
18698movd m3, [r4 + 28]
18699palignr m3, m1, 1
18700punpcklbw m1, m3
18701pmaddubsw m3, m1, m6
18702pmulhrsw m3, m7
18703movu m4, [r4 + 35]
18704movd m5, [r4 + 36]
18705palignr m5, m4, 1
18706punpcklbw m4, m5
18707pmaddubsw m5, m4, m6
18708pmulhrsw m5, m7
18709packuswb m3, m5
18710movu [r0 + 89 * 16], m3
18711
18712; mode 6 [row 25 - second half]
18713movu [r0 + 307 * 16], m3
18714
18715; mode 4 [row 15]
18716movu m6, [r5 + 16 * 16]
18717pmaddubsw m3, m0, m6
18718pmulhrsw m3, m7
18719pmaddubsw m5, m2, m6
18720pmulhrsw m5, m7
18721packuswb m3, m5
18722movu [r0 + 158 * 16], m3
18723pmaddubsw m3, m1, m6
18724pmulhrsw m3, m7
18725pmaddubsw m5, m4, m6
18726pmulhrsw m5, m7
18727packuswb m3, m5
18728movu [r0 + 159 * 16], m3
18729
18730; mode 5 [row 18]
18731movu m6, [r5 + 3 * 16]
18732pmaddubsw m3, m0, m6
18733pmulhrsw m3, m7
18734pmaddubsw m5, m2, m6
18735pmulhrsw m5, m7
18736packuswb m3, m5
18737movu [r0 + 228 * 16], m3
18738pmaddubsw m3, m1, m6
18739pmulhrsw m3, m7
18740pmaddubsw m5, m4, m6
18741pmulhrsw m5, m7
18742packuswb m3, m5
18743movu [r0 + 229 * 16], m3
18744
18745; mode 5 [row 19]
18746movu m6, [r5 + 20 * 16]
18747pmaddubsw m3, m0, m6
18748pmulhrsw m3, m7
18749pmaddubsw m5, m2, m6
18750pmulhrsw m5, m7
18751packuswb m3, m5
18752movu [r0 + 230 * 16], m3
18753pmaddubsw m3, m1, m6
18754pmulhrsw m3, m7
18755pmaddubsw m5, m4, m6
18756pmulhrsw m5, m7
18757packuswb m3, m5
18758movu [r0 + 231 * 16], m3
18759
18760; mode 6 [row 24]
18761movu m6, [r5 + 5 * 16]
18762pmaddubsw m3, m0, m6
18763pmulhrsw m3, m7
18764pmaddubsw m5, m2, m6
18765pmulhrsw m5, m7
18766packuswb m3, m5
18767movu [r0 + 304 * 16], m3
18768pmaddubsw m3, m1, m6
18769pmulhrsw m3, m7
18770pmaddubsw m5, m4, m6
18771pmulhrsw m5, m7
18772packuswb m3, m5
18773movu [r0 + 305 * 16], m3
18774
18775; mode 6 [row 26]
18776movu m6, [r5 + 31 * 16]
18777pmaddubsw m3, m0, m6
18778pmulhrsw m3, m7
18779pmaddubsw m5, m2, m6
18780pmulhrsw m5, m7
18781packuswb m3, m5
18782movu [r0 + 308 * 16], m3
18783pmaddubsw m3, m1, m6
18784pmulhrsw m3, m7
18785pmaddubsw m5, m4, m6
18786pmulhrsw m5, m7
18787packuswb m3, m5
18788movu [r0 + 309 * 16], m3
18789
18790; mode 3 [row 13]
18791movu m6, [r5 + 12 * 16]
18792movu m0, [r4 + 12]
18793movd m1, [r4 + 13]
18794palignr m1, m0, 1
18795punpcklbw m0, m1
18796pmaddubsw m1, m0, m6
18797pmulhrsw m1, m7
18798movu m2, [r4 + 20]
18799movd m3, [r4 + 21]
18800palignr m3, m2, 1
18801punpcklbw m2, m3
18802pmaddubsw m3, m2, m6
18803pmulhrsw m3, m7
18804packuswb m1, m3
18805movu [r0 + 90 * 16], m1
18806
18807movu m1, [r4 + 28]
18808movd m3, [r4 + 29]
18809palignr m3, m1, 1
18810punpcklbw m1, m3
18811pmaddubsw m3, m1, m6
18812pmulhrsw m3, m7
18813movu m4, [r4 + 36]
18814movd m5, [r4 + 37]
18815palignr m5, m4, 1
18816punpcklbw m4, m5
18817pmaddubsw m5, m4, m6
18818pmulhrsw m5, m7
18819packuswb m3, m5
18820movu [r0 + 91 * 16], m3
18821
18822; mode 4 [row 16]
18823movu m6, [r5 + 5 * 16]
18824pmaddubsw m3, m0, m6
18825pmulhrsw m3, m7
18826pmaddubsw m5, m2, m6
18827pmulhrsw m5, m7
18828packuswb m3, m5
18829movu [r0 + 160 * 16], m3
18830
18831; mode 5 [row 20 - first half]
18832movu [r0 + 232 * 16], m3
18833
18834pmaddubsw m3, m1, m6
18835pmulhrsw m3, m7
18836pmaddubsw m5, m4, m6
18837pmulhrsw m5, m7
18838packuswb m3, m5
18839movu [r0 + 161 * 16], m3
18840
18841; mode 5 [row 20 - second half]
18842movu [r0 + 233 * 16], m3
18843
18844; mode 4 [row 17]
18845movu m6, [r5 + 26 * 16]
18846pmaddubsw m3, m0, m6
18847pmulhrsw m3, m7
18848pmaddubsw m5, m2, m6
18849pmulhrsw m5, m7
18850packuswb m3, m5
18851movu [r0 + 162 * 16], m3
18852pmaddubsw m3, m1, m6
18853pmulhrsw m3, m7
18854pmaddubsw m5, m4, m6
18855pmulhrsw m5, m7
18856packuswb m3, m5
18857movu [r0 + 163 * 16], m3
18858
18859; mode 5 [row 21]
18860movu m6, [r5 + 22 * 16]
18861pmaddubsw m3, m0, m6
18862pmulhrsw m3, m7
18863pmaddubsw m5, m2, m6
18864pmulhrsw m5, m7
18865packuswb m3, m5
18866movu [r0 + 234 * 16], m3
18867pmaddubsw m3, m1, m6
18868pmulhrsw m3, m7
18869pmaddubsw m5, m4, m6
18870pmulhrsw m5, m7
18871packuswb m3, m5
18872movu [r0 + 235 * 16], m3
18873
18874; mode 6 [row 27]
18875movu m6, [r5 + 12 * 16]
18876pmaddubsw m3, m0, m6
18877pmulhrsw m3, m7
18878pmaddubsw m5, m2, m6
18879pmulhrsw m5, m7
18880packuswb m3, m5
18881movu [r0 + 310 * 16], m3
18882pmaddubsw m3, m1, m6
18883pmulhrsw m3, m7
18884pmaddubsw m5, m4, m6
18885pmulhrsw m5, m7
18886packuswb m3, m5
18887movu [r0 + 311 * 16], m3
18888
18889; mode 6 [row 28]
18890movu m6, [r5 + 25 * 16]
18891pmaddubsw m3, m0, m6
18892pmulhrsw m3, m7
18893pmaddubsw m5, m2, m6
18894pmulhrsw m5, m7
18895packuswb m3, m5
18896movu [r0 + 312 * 16], m3
18897pmaddubsw m3, m1, m6
18898pmulhrsw m3, m7
18899pmaddubsw m5, m4, m6
18900pmulhrsw m5, m7
18901packuswb m3, m5
18902movu [r0 + 313 * 16], m3
18903
18904; mode 3 [row 14]
18905movu m6, [r5 + 6 * 16]
18906movu m0, [r4 + 13]
18907movd m1, [r4 + 14]
18908palignr m1, m0, 1
18909punpcklbw m0, m1
18910pmaddubsw m1, m0, m6
18911pmulhrsw m1, m7
18912movu m2, [r4 + 21]
18913movd m3, [r4 + 22]
18914palignr m3, m2, 1
18915punpcklbw m2, m3
18916pmaddubsw m3, m2, m6
18917pmulhrsw m3, m7
18918packuswb m1, m3
18919movu [r0 + 92 * 16], m1
18920
18921; mode 6 [row 29 - first half]
18922movu [r0 + 314 * 16], m1
18923
18924movu m1, [r4 + 29]
18925movd m3, [r4 + 30]
18926palignr m3, m1, 1
18927punpcklbw m1, m3
18928pmaddubsw m3, m1, m6
18929pmulhrsw m3, m7
18930movu m4, [r4 + 37]
18931movd m5, [r4 + 38]
18932palignr m5, m4, 1
18933punpcklbw m4, m5
18934pmaddubsw m5, m4, m6
18935pmulhrsw m5, m7
18936packuswb m3, m5
18937movu [r0 + 93 * 16], m3
18938
18939; mode 6 [row 29 - second half]
18940movu [r0 + 315 * 16], m3
18941
18942; mode 4 [row 18]
18943movu m6, [r5 + 15 * 16]
18944pmaddubsw m3, m0, m6
18945pmulhrsw m3, m7
18946pmaddubsw m5, m2, m6
18947pmulhrsw m5, m7
18948packuswb m3, m5
18949movu [r0 + 164 * 16], m3
18950pmaddubsw m3, m1, m6
18951pmulhrsw m3, m7
18952pmaddubsw m5, m4, m6
18953pmulhrsw m5, m7
18954packuswb m3, m5
18955movu [r0 + 165 * 16], m3
18956
18957; mode 5 [row 22]
18958movu m6, [r5 + 7 * 16]
18959pmaddubsw m3, m0, m6
18960pmulhrsw m3, m7
18961pmaddubsw m5, m2, m6
18962pmulhrsw m5, m7
18963packuswb m3, m5
18964movu [r0 + 236 * 16], m3
18965pmaddubsw m3, m1, m6
18966pmulhrsw m3, m7
18967pmaddubsw m5, m4, m6
18968pmulhrsw m5, m7
18969packuswb m3, m5
18970movu [r0 + 237 * 16], m3
18971
18972; mode 5 [row 23]
18973movu m6, [r5 + 24 * 16]
18974pmaddubsw m3, m0, m6
18975pmulhrsw m3, m7
18976pmaddubsw m5, m2, m6
18977pmulhrsw m5, m7
18978packuswb m3, m5
18979movu [r0 + 238 * 16], m3
18980pmaddubsw m3, m1, m6
18981pmulhrsw m3, m7
18982pmaddubsw m5, m4, m6
18983pmulhrsw m5, m7
18984packuswb m3, m5
18985movu [r0 + 239 * 16], m3
18986
18987; mode 6 [row 30]
18988movu m6, [r5 + 19 * 16]
18989pmaddubsw m3, m0, m6
18990pmulhrsw m3, m7
18991pmaddubsw m5, m2, m6
18992pmulhrsw m5, m7
18993packuswb m3, m5
18994movu [r0 + 316 * 16], m3
18995pmaddubsw m3, m1, m6
18996pmulhrsw m3, m7
18997pmaddubsw m5, m4, m6
18998pmulhrsw m5, m7
18999packuswb m3, m5
19000movu [r0 + 317 * 16], m3
19001
19002; mode 3 [row 16]
19003movu m6, [r5 + 26 * 16]
19004movu m0, [r4 + 14]
19005movd m1, [r4 + 15]
19006palignr m1, m0, 1
19007punpcklbw m0, m1
19008pmaddubsw m1, m0, m6
19009pmulhrsw m1, m7
19010movu m2, [r4 + 22]
19011movd m3, [r4 + 23]
19012palignr m3, m2, 1
19013punpcklbw m2, m3
19014pmaddubsw m3, m2, m6
19015pmulhrsw m3, m7
19016packuswb m1, m3
19017movu [r0 + 96 * 16], m1
19018
19019; mode 5 [row 25 - first half]
19020movu [r0 + 242 * 16], m1
19021
19022movu m1, [r4 + 30]
19023movd m3, [r4 + 31]
19024palignr m3, m1, 1
19025punpcklbw m1, m3
19026pmaddubsw m3, m1, m6
19027pmulhrsw m3, m7
19028movu m4, [r4 + 38]
19029movd m5, [r4 + 39]
19030palignr m5, m4, 1
19031punpcklbw m4, m5
19032pmaddubsw m5, m4, m6
19033pmulhrsw m5, m7
19034packuswb m3, m5
19035movu [r0 + 97 * 16], m3
19036
19037; mode 5 [row 25 - second half]
19038movu [r0 + 243 * 16], m3
19039
19040; mode 4 [row 19]
19041movu m6, [r5 + 4 * 16]
19042pmaddubsw m3, m0, m6
19043pmulhrsw m3, m7
19044pmaddubsw m5, m2, m6
19045pmulhrsw m5, m7
19046packuswb m3, m5
19047movu [r0 + 166 * 16], m3
19048pmaddubsw m3, m1, m6
19049pmulhrsw m3, m7
19050pmaddubsw m5, m4, m6
19051pmulhrsw m5, m7
19052packuswb m3, m5
19053movu [r0 + 167 * 16], m3
19054
19055; mode 4 [row 20]
19056movu m6, [r5 + 25 * 16]
19057pmaddubsw m3, m0, m6
19058pmulhrsw m3, m7
19059pmaddubsw m5, m2, m6
19060pmulhrsw m5, m7
19061packuswb m3, m5
19062movu [r0 + 168 * 16], m3
19063pmaddubsw m3, m1, m6
19064pmulhrsw m3, m7
19065pmaddubsw m5, m4, m6
19066pmulhrsw m5, m7
19067packuswb m3, m5
19068movu [r0 + 169 * 16], m3
19069
19070; mode 5 [row 24]
19071movu m6, [r5 + 9 * 16]
19072pmaddubsw m3, m0, m6
19073pmulhrsw m3, m7
19074pmaddubsw m5, m2, m6
19075pmulhrsw m5, m7
19076packuswb m3, m5
19077movu [r0 + 240 * 16], m3
19078pmaddubsw m3, m1, m6
19079pmulhrsw m3, m7
19080pmaddubsw m5, m4, m6
19081pmulhrsw m5, m7
19082packuswb m3, m5
19083movu [r0 + 241 * 16], m3
19084
19085; mode 3 [row 17]
19086movu m6, [r5 + 20 * 16]
19087movu m0, [r4 + 15]
19088movd m1, [r4 + 16]
19089palignr m1, m0, 1
19090punpcklbw m0, m1
19091pmaddubsw m1, m0, m6
19092pmulhrsw m1, m7
19093movu m2, [r4 + 23]
19094movd m3, [r4 + 24]
19095palignr m3, m2, 1
19096punpcklbw m2, m3
19097pmaddubsw m3, m2, m6
19098pmulhrsw m3, m7
19099packuswb m1, m3
19100movu [r0 + 98 * 16], m1
19101
19102movu m1, [r4 + 31]
19103movd m3, [r4 + 32]
19104palignr m3, m1, 1
19105punpcklbw m1, m3
19106pmaddubsw m3, m1, m6
19107pmulhrsw m3, m7
19108movu m4, [r4 + 39]
19109movd m5, [r4 + 40]
19110palignr m5, m4, 1
19111punpcklbw m4, m5
19112pmaddubsw m5, m4, m6
19113pmulhrsw m5, m7
19114packuswb m3, m5
19115movu [r0 + 99 * 16], m3
19116
19117; mode 4 [row 21]
19118movu m6, [r5 + 14 * 16]
19119pmaddubsw m3, m0, m6
19120pmulhrsw m3, m7
19121pmaddubsw m5, m2, m6
19122pmulhrsw m5, m7
19123packuswb m3, m5
19124movu [r0 + 170 * 16], m3
19125pmaddubsw m3, m1, m6
19126pmulhrsw m3, m7
19127pmaddubsw m5, m4, m6
19128pmulhrsw m5, m7
19129packuswb m3, m5
19130movu [r0 + 171 * 16], m3
19131
19132; mode 5 [row 26]
19133movu m6, [r5 + 11 * 16]
19134pmaddubsw m3, m0, m6
19135pmulhrsw m3, m7
19136pmaddubsw m5, m2, m6
19137pmulhrsw m5, m7
19138packuswb m3, m5
19139movu [r0 + 244 * 16], m3
19140pmaddubsw m3, m1, m6
19141pmulhrsw m3, m7
19142pmaddubsw m5, m4, m6
19143pmulhrsw m5, m7
19144packuswb m3, m5
19145movu [r0 + 245 * 16], m3
19146
19147; mode 5 [row 27]
19148movu m6, [r5 + 28 * 16]
19149pmaddubsw m3, m0, m6
19150pmulhrsw m3, m7
19151pmaddubsw m5, m2, m6
19152pmulhrsw m5, m7
19153packuswb m3, m5
19154movu [r0 + 246 * 16], m3
19155pmaddubsw m3, m1, m6
19156pmulhrsw m3, m7
19157pmaddubsw m5, m4, m6
19158pmulhrsw m5, m7
19159packuswb m3, m5
19160movu [r0 + 247 * 16], m3
19161
19162; mode 3 [row 18]
19163movu m6, [r5 + 14 * 16]
19164movu m0, [r4 + 16]
19165movd m1, [r4 + 17]
19166palignr m1, m0, 1
19167punpcklbw m0, m1
19168pmaddubsw m1, m0, m6
19169pmulhrsw m1, m7
19170movu m2, [r4 + 24]
19171movd m3, [r4 + 25]
19172palignr m3, m2, 1
19173punpcklbw m2, m3
19174pmaddubsw m3, m2, m6
19175pmulhrsw m3, m7
19176packuswb m1, m3
19177movu [r0 + 100 * 16], m1
19178
19179movu m1, [r4 + 32]
19180movd m3, [r4 + 33]
19181palignr m3, m1, 1
19182punpcklbw m1, m3
19183pmaddubsw m3, m1, m6
19184pmulhrsw m3, m7
19185movu m4, [r4 + 40]
19186movd m5, [r4 + 41]
19187palignr m5, m4, 1
19188punpcklbw m4, m5
19189pmaddubsw m5, m4, m6
19190pmulhrsw m5, m7
19191packuswb m3, m5
19192movu [r0 + 101 * 16], m3
19193
19194; mode 4 [row 22]
19195movu m6, [r5 + 3 * 16]
19196pmaddubsw m3, m0, m6
19197pmulhrsw m3, m7
19198pmaddubsw m5, m2, m6
19199pmulhrsw m5, m7
19200packuswb m3, m5
19201movu [r0 + 172 * 16], m3
19202pmaddubsw m3, m1, m6
19203pmulhrsw m3, m7
19204pmaddubsw m5, m4, m6
19205pmulhrsw m5, m7
19206packuswb m3, m5
19207movu [r0 + 173 * 16], m3
19208
19209; mode 4 [row 23]
19210movu m6, [r5 + 24 * 16]
19211pmaddubsw m3, m0, m6
19212pmulhrsw m3, m7
19213pmaddubsw m5, m2, m6
19214pmulhrsw m5, m7
19215packuswb m3, m5
19216movu [r0 + 174 * 16], m3
19217pmaddubsw m3, m1, m6
19218pmulhrsw m3, m7
19219pmaddubsw m5, m4, m6
19220pmulhrsw m5, m7
19221packuswb m3, m5
19222movu [r0 + 175 * 16], m3
19223
19224; mode 5 [row 28]
19225movu m6, [r5 + 13 * 16]
19226pmaddubsw m3, m0, m6
19227pmulhrsw m3, m7
19228pmaddubsw m5, m2, m6
19229pmulhrsw m5, m7
19230packuswb m3, m5
19231movu [r0 + 248 * 16], m3
19232pmaddubsw m3, m1, m6
19233pmulhrsw m3, m7
19234pmaddubsw m5, m4, m6
19235pmulhrsw m5, m7
19236packuswb m3, m5
19237movu [r0 + 249 * 16], m3
19238
19239; mode 5 [row 29]
19240movu m6, [r5 + 30 * 16]
19241pmaddubsw m3, m0, m6
19242pmulhrsw m3, m7
19243pmaddubsw m5, m2, m6
19244pmulhrsw m5, m7
19245packuswb m3, m5
19246movu [r0 + 250 * 16], m3
19247pmaddubsw m3, m1, m6
19248pmulhrsw m3, m7
19249pmaddubsw m5, m4, m6
19250pmulhrsw m5, m7
19251packuswb m3, m5
19252movu [r0 + 251 * 16], m3
19253
19254; mode 3 [row 19]
19255movu m6, [r5 + 8 * 16]
19256movu m0, [r4 + 17]
19257movd m1, [r4 + 18]
19258palignr m1, m0, 1
19259punpcklbw m0, m1
19260pmaddubsw m1, m0, m6
19261pmulhrsw m1, m7
19262movu m2, [r4 + 25]
19263movd m3, [r4 + 26]
19264palignr m3, m2, 1
19265punpcklbw m2, m3
19266pmaddubsw m3, m2, m6
19267pmulhrsw m3, m7
19268packuswb m1, m3
19269movu [r0 + 102 * 16], m1
19270
19271movu m1, [r4 + 33]
19272movd m3, [r4 + 34]
19273palignr m3, m1, 1
19274punpcklbw m1, m3
19275pmaddubsw m3, m1, m6
19276pmulhrsw m3, m7
19277movu m4, [r4 + 41]
19278movd m5, [r4 + 42]
19279palignr m5, m4, 1
19280punpcklbw m4, m5
19281pmaddubsw m5, m4, m6
19282pmulhrsw m5, m7
19283packuswb m3, m5
19284movu [r0 + 103 * 16], m3
19285
19286; mode 4 [row 24]
19287movu m6, [r5 + 13 * 16]
19288pmaddubsw m3, m0, m6
19289pmulhrsw m3, m7
19290pmaddubsw m5, m2, m6
19291pmulhrsw m5, m7
19292packuswb m3, m5
19293movu [r0 + 176 * 16], m3
19294pmaddubsw m3, m1, m6
19295pmulhrsw m3, m7
19296pmaddubsw m5, m4, m6
19297pmulhrsw m5, m7
19298packuswb m3, m5
19299movu [r0 + 177 * 16], m3
19300
19301; mode 5 [row 30]
19302movu m6, [r5 + 15 * 16]
19303pmaddubsw m3, m0, m6
19304pmulhrsw m3, m7
19305pmaddubsw m5, m2, m6
19306pmulhrsw m5, m7
19307packuswb m3, m5
19308movu [r0 + 252 * 16], m3
19309pmaddubsw m3, m1, m6
19310pmulhrsw m3, m7
19311pmaddubsw m5, m4, m6
19312pmulhrsw m5, m7
19313packuswb m3, m5
19314movu [r0 + 253 * 16], m3
19315
19316; mode 3 [row 20]
19317movu m6, [r5 + 2 * 16]
19318movu m0, [r4 + 18]
19319movd m1, [r4 + 19]
19320palignr m1, m0, 1
19321punpcklbw m0, m1
19322pmaddubsw m1, m0, m6
19323pmulhrsw m1, m7
19324movu m2, [r4 + 26]
19325movd m3, [r4 + 27]
19326palignr m3, m2, 1
19327punpcklbw m2, m3
19328pmaddubsw m3, m2, m6
19329pmulhrsw m3, m7
19330packuswb m1, m3
19331movu [r0 + 104 * 16], m1
19332
19333movu m1, [r4 + 34]
19334movd m3, [r4 + 35]
19335palignr m3, m1, 1
19336punpcklbw m1, m3
19337pmaddubsw m3, m1, m6
19338pmulhrsw m3, m7
19339movu m4, [r4 + 42]
19340movd m5, [r4 + 43]
19341palignr m5, m4, 1
19342punpcklbw m4, m5
19343pmaddubsw m5, m4, m6
19344pmulhrsw m5, m7
19345packuswb m3, m5
19346movu [r0 + 105 * 16], m3
19347
19348; mode 4 [row 25]
19349pmaddubsw m3, m0, m6
19350pmulhrsw m3, m7
19351pmaddubsw m5, m2, m6
19352pmulhrsw m5, m7
19353packuswb m3, m5
19354movu [r0 + 178 * 16], m3
19355pmaddubsw m3, m1, m6
19356pmulhrsw m3, m7
19357pmaddubsw m5, m4, m6
19358pmulhrsw m5, m7
19359packuswb m3, m5
19360movu [r0 + 179 * 16], m3
19361
19362; mode 4 [row 26]
19363movu m6, [r5 + 23 * 16]
19364pmaddubsw m3, m0, m6
19365pmulhrsw m3, m7
19366pmaddubsw m5, m2, m6
19367pmulhrsw m5, m7
19368packuswb m3, m5
19369movu [r0 + 180 * 16], m3
19370pmaddubsw m3, m1, m6
19371pmulhrsw m3, m7
19372pmaddubsw m5, m4, m6
19373pmulhrsw m5, m7
19374packuswb m3, m5
19375movu [r0 + 181 * 16], m3
19376
19377; mode 3 [row 21]
19378movu m6, [r5 + 28 * 16]
19379pmaddubsw m3, m0, m6
19380pmulhrsw m3, m7
19381pmaddubsw m5, m2, m6
19382pmulhrsw m5, m7
19383packuswb m3, m5
19384movu [r0 + 106 * 16], m3
19385pmaddubsw m3, m1, m6
19386pmulhrsw m3, m7
19387pmaddubsw m5, m4, m6
19388pmulhrsw m5, m7
19389packuswb m3, m5
19390movu [r0 + 107 * 16], m3
19391
19392; mode 3 [row 22]
19393movu m6, [r5 + 22 * 16]
19394movu m0, [r4 + 19]
19395movd m1, [r4 + 20]
19396palignr m1, m0, 1
19397punpcklbw m0, m1
19398pmaddubsw m1, m0, m6
19399pmulhrsw m1, m7
19400movu m2, [r4 + 27]
19401movd m3, [r4 + 28]
19402palignr m3, m2, 1
19403punpcklbw m2, m3
19404pmaddubsw m3, m2, m6
19405pmulhrsw m3, m7
19406packuswb m1, m3
19407movu [r0 + 108 * 16], m1
19408
19409movu m1, [r4 + 35]
19410movd m3, [r4 + 36]
19411palignr m3, m1, 1
19412punpcklbw m1, m3
19413pmaddubsw m3, m1, m6
19414pmulhrsw m3, m7
19415movu m4, [r4 + 43]
19416movd m5, [r4 + 44]
19417palignr m5, m4, 1
19418punpcklbw m4, m5
19419pmaddubsw m5, m4, m6
19420pmulhrsw m5, m7
19421packuswb m3, m5
19422movu [r0 + 109 * 16], m3
19423
19424; mode 4 [row 27]
19425movu m6, [r5 + 12 * 16]
19426pmaddubsw m3, m0, m6
19427pmulhrsw m3, m7
19428pmaddubsw m5, m2, m6
19429pmulhrsw m5, m7
19430packuswb m3, m5
19431movu [r0 + 182 * 16], m3
19432pmaddubsw m3, m1, m6
19433pmulhrsw m3, m7
19434pmaddubsw m5, m4, m6
19435pmulhrsw m5, m7
19436packuswb m3, m5
19437movu [r0 + 183 * 16], m3
19438
19439; mode 3 [row 23]
19440movu m6, [r5 + 16 * 16]
19441movu m0, [r4 + 20]
19442movd m1, [r4 + 21]
19443palignr m1, m0, 1
19444punpcklbw m0, m1
19445pmaddubsw m1, m0, m6
19446pmulhrsw m1, m7
19447movu m2, [r4 + 28]
19448movd m3, [r4 + 29]
19449palignr m3, m2, 1
19450punpcklbw m2, m3
19451pmaddubsw m3, m2, m6
19452pmulhrsw m3, m7
19453packuswb m1, m3
19454movu [r0 + 110 * 16], m1
19455
19456movu m1, [r4 + 36]
19457movd m3, [r4 + 37]
19458palignr m3, m1, 1
19459punpcklbw m1, m3
19460pmaddubsw m3, m1, m6
19461pmulhrsw m3, m7
19462movu m4, [r4 + 44]
19463movd m5, [r4 + 45]
19464palignr m5, m4, 1
19465punpcklbw m4, m5
19466pmaddubsw m5, m4, m6
19467pmulhrsw m5, m7
19468packuswb m3, m5
19469movu [r0 + 111 * 16], m3
19470
19471; mode 4 [row 28]
19472movu m6, [r5 + 1 * 16]
19473pmaddubsw m3, m0, m6
19474pmulhrsw m3, m7
19475pmaddubsw m5, m2, m6
19476pmulhrsw m5, m7
19477packuswb m3, m5
19478movu [r0 + 184 * 16], m3
19479pmaddubsw m3, m1, m6
19480pmulhrsw m3, m7
19481pmaddubsw m5, m4, m6
19482pmulhrsw m5, m7
19483packuswb m3, m5
19484movu [r0 + 185 * 16], m3
19485
19486; mode 4 [row 29]
19487movu m6, [r5 + 22 * 16]
19488pmaddubsw m3, m0, m6
19489pmulhrsw m3, m7
19490pmaddubsw m5, m2, m6
19491pmulhrsw m5, m7
19492packuswb m3, m5
19493movu [r0 + 186 * 16], m3
19494pmaddubsw m3, m1, m6
19495pmulhrsw m3, m7
19496pmaddubsw m5, m4, m6
19497pmulhrsw m5, m7
19498packuswb m3, m5
19499movu [r0 + 187 * 16], m3
19500
19501; mode 3 [row 24]
19502movu m6, [r5 + 10 * 16]
19503movu m0, [r4 + 21]
19504movd m1, [r4 + 22]
19505palignr m1, m0, 1
19506punpcklbw m0, m1
19507pmaddubsw m1, m0, m6
19508pmulhrsw m1, m7
19509movu m2, [r4 + 29]
19510movd m3, [r4 + 30]
19511palignr m3, m2, 1
19512punpcklbw m2, m3
19513pmaddubsw m3, m2, m6
19514pmulhrsw m3, m7
19515packuswb m1, m3
19516movu [r0 + 112 * 16], m1
19517
19518movu m1, [r4 + 37]
19519movd m3, [r4 + 38]
19520palignr m3, m1, 1
19521punpcklbw m1, m3
19522pmaddubsw m3, m1, m6
19523pmulhrsw m3, m7
19524movu m4, [r4 + 45]
19525movd m5, [r4 + 46]
19526palignr m5, m4, 1
19527punpcklbw m4, m5
19528pmaddubsw m5, m4, m6
19529pmulhrsw m5, m7
19530packuswb m3, m5
19531movu [r0 + 113 * 16], m3
19532
19533; mode 4 [row 30]
19534movu m6, [r5 + 11 * 16]
19535pmaddubsw m3, m0, m6
19536pmulhrsw m3, m7
19537pmaddubsw m5, m2, m6
19538pmulhrsw m5, m7
19539packuswb m3, m5
19540movu [r0 + 188 * 16], m3
19541pmaddubsw m3, m1, m6
19542pmulhrsw m3, m7
19543pmaddubsw m5, m4, m6
19544pmulhrsw m5, m7
19545packuswb m3, m5
19546movu [r0 + 189 * 16], m3
19547
19548; mode 3 [row 25]
19549movu m6, [r5 + 4 * 16]
19550movu m0, [r4 + 22]
19551movd m1, [r4 + 23]
19552palignr m1, m0, 1
19553punpcklbw m0, m1
19554pmaddubsw m1, m0, m6
19555pmulhrsw m1, m7
19556movu m2, [r4 + 30]
19557movd m3, [r4 + 31]
19558palignr m3, m2, 1
19559punpcklbw m2, m3
19560pmaddubsw m3, m2, m6
19561pmulhrsw m3, m7
19562packuswb m1, m3
19563movu [r0 + 114 * 16], m1
19564
19565movu m1, [r4 + 38]
19566movd m3, [r4 + 39]
19567palignr m3, m1, 1
19568punpcklbw m1, m3
19569pmaddubsw m3, m1, m6
19570pmulhrsw m3, m7
19571movu m4, [r4 + 46]
19572movd m5, [r4 + 47]
19573palignr m5, m4, 1
19574punpcklbw m4, m5
19575pmaddubsw m5, m4, m6
19576pmulhrsw m5, m7
19577packuswb m3, m5
19578movu [r0 + 115 * 16], m3
19579
19580; mode 3 [row 26]
19581movu m6, [r5 + 30 * 16]
19582pmaddubsw m3, m0, m6
19583pmulhrsw m3, m7
19584pmaddubsw m5, m2, m6
19585pmulhrsw m5, m7
19586packuswb m3, m5
19587movu [r0 + 116 * 16], m3
19588pmaddubsw m3, m1, m6
19589pmulhrsw m3, m7
19590pmaddubsw m5, m4, m6
19591pmulhrsw m5, m7
19592packuswb m3, m5
19593movu [r0 + 117 * 16], m3
19594
19595; mode 3 [row 27]
19596movu m6, [r5 + 24 * 16]
19597movu m0, [r4 + 23]
19598movd m1, [r4 + 24]
19599palignr m1, m0, 1
19600punpcklbw m0, m1
19601pmaddubsw m1, m0, m6
19602pmulhrsw m1, m7
19603movu m2, [r4 + 31]
19604movd m3, [r4 + 32]
19605palignr m3, m2, 1
19606punpcklbw m2, m3
19607pmaddubsw m3, m2, m6
19608pmulhrsw m3, m7
19609packuswb m1, m3
19610movu [r0 + 118 * 16], m1
19611
19612movu m1, [r4 + 39]
19613movd m3, [r4 + 40]
19614palignr m3, m1, 1
19615punpcklbw m1, m3
19616pmaddubsw m3, m1, m6
19617pmulhrsw m3, m7
19618movu m4, [r4 + 47]
19619movd m5, [r4 + 48]
19620palignr m5, m4, 1
19621punpcklbw m4, m5
19622pmaddubsw m5, m4, m6
19623pmulhrsw m5, m7
19624packuswb m3, m5
19625movu [r0 + 119 * 16], m3
19626
19627; mode 3 [row 28]
19628movu m6, [r5 + 18 * 16]
19629movu m0, [r4 + 24]
19630movd m1, [r4 + 25]
19631palignr m1, m0, 1
19632punpcklbw m0, m1
19633pmaddubsw m1, m0, m6
19634pmulhrsw m1, m7
19635movu m2, [r4 + 32]
19636movd m3, [r4 + 33]
19637palignr m3, m2, 1
19638punpcklbw m2, m3
19639pmaddubsw m3, m2, m6
19640pmulhrsw m3, m7
19641packuswb m1, m3
19642movu [r0 + 120 * 16], m1
19643
19644movu m1, [r4 + 40]
19645movd m3, [r4 + 41]
19646palignr m3, m1, 1
19647punpcklbw m1, m3
19648pmaddubsw m3, m1, m6
19649pmulhrsw m3, m7
19650movu m4, [r4 + 48]
19651movd m5, [r4 + 49]
19652palignr m5, m4, 1
19653punpcklbw m4, m5
19654pmaddubsw m5, m4, m6
19655pmulhrsw m5, m7
19656packuswb m3, m5
19657movu [r0 + 121 * 16], m3
19658
19659; mode 3 [row 29]
19660movu m6, [r5 + 12 * 16]
19661movu m0, [r4 + 25]
19662movd m1, [r4 + 26]
19663palignr m1, m0, 1
19664punpcklbw m0, m1
19665pmaddubsw m1, m0, m6
19666pmulhrsw m1, m7
19667movu m2, [r4 + 33]
19668movd m3, [r4 + 34]
19669palignr m3, m2, 1
19670punpcklbw m2, m3
19671pmaddubsw m3, m2, m6
19672pmulhrsw m3, m7
19673packuswb m1, m3
19674movu [r0 + 122 * 16], m1
19675
19676movu m1, [r4 + 41]
19677movd m3, [r4 + 42]
19678palignr m3, m1, 1
19679punpcklbw m1, m3
19680pmaddubsw m3, m1, m6
19681pmulhrsw m3, m7
19682movu m4, [r4 + 49]
19683movd m5, [r4 + 50]
19684palignr m5, m4, 1
19685punpcklbw m4, m5
19686pmaddubsw m5, m4, m6
19687pmulhrsw m5, m7
19688packuswb m3, m5
19689movu [r0 + 123 * 16], m3
19690
19691; mode 3 [row 30]
19692movu m6, [r5 + 6 * 16]
19693movu m0, [r4 + 26]
19694movd m1, [r4 + 27]
19695palignr m1, m0, 1
19696punpcklbw m0, m1
19697pmaddubsw m1, m0, m6
19698pmulhrsw m1, m7
19699movu m2, [r4 + 34]
19700movd m3, [r4 + 35]
19701palignr m3, m2, 1
19702punpcklbw m2, m3
19703pmaddubsw m3, m2, m6
19704pmulhrsw m3, m7
19705packuswb m1, m3
19706movu [r0 + 124 * 16], m1
19707
19708movu m1, [r4 + 42]
19709movd m3, [r4 + 43]
19710palignr m3, m1, 1
19711punpcklbw m1, m3
19712pmaddubsw m3, m1, m6
19713pmulhrsw m3, m7
19714movu m4, [r4 + 50]
19715movd m5, [r4 + 51]
19716palignr m5, m4, 1
19717punpcklbw m4, m5
19718pmaddubsw m5, m4, m6
19719pmulhrsw m5, m7
19720packuswb m3, m5
19721movu [r0 + 125 * 16], m3
19722
19723; mode 10
19724movu m1, [r2 + 1]
19725movu m2, [r2 + 17]
19726movu [r0 + 512 * 16], m1
19727movu [r0 + 513 * 16], m2
19728movu [r0 + 514 * 16], m1
19729movu [r0 + 515 * 16], m2
19730movu [r0 + 516 * 16], m1
19731movu [r0 + 517 * 16], m2
19732movu [r0 + 518 * 16], m1
19733movu [r0 + 519 * 16], m2
19734movu [r0 + 520 * 16], m1
19735movu [r0 + 521 * 16], m2
19736movu [r0 + 522 * 16], m1
19737movu [r0 + 523 * 16], m2
19738movu [r0 + 524 * 16], m1
19739movu [r0 + 525 * 16], m2
19740movu [r0 + 526 * 16], m1
19741movu [r0 + 527 * 16], m2
19742
19743movu [r0 + 528 * 16], m1
19744movu [r0 + 529 * 16], m2
19745movu [r0 + 530 * 16], m1
19746movu [r0 + 531 * 16], m2
19747movu [r0 + 532 * 16], m1
19748movu [r0 + 533 * 16], m2
19749movu [r0 + 534 * 16], m1
19750movu [r0 + 535 * 16], m2
19751movu [r0 + 536 * 16], m1
19752movu [r0 + 537 * 16], m2
19753movu [r0 + 538 * 16], m1
19754movu [r0 + 539 * 16], m2
19755movu [r0 + 540 * 16], m1
19756movu [r0 + 541 * 16], m2
19757movu [r0 + 542 * 16], m1
19758movu [r0 + 543 * 16], m2
19759
19760movu [r0 + 544 * 16], m1
19761movu [r0 + 545 * 16], m2
19762movu [r0 + 546 * 16], m1
19763movu [r0 + 547 * 16], m2
19764movu [r0 + 548 * 16], m1
19765movu [r0 + 549 * 16], m2
19766movu [r0 + 550 * 16], m1
19767movu [r0 + 551 * 16], m2
19768movu [r0 + 552 * 16], m1
19769movu [r0 + 553 * 16], m2
19770movu [r0 + 554 * 16], m1
19771movu [r0 + 555 * 16], m2
19772movu [r0 + 556 * 16], m1
19773movu [r0 + 557 * 16], m2
19774movu [r0 + 558 * 16], m1
19775movu [r0 + 559 * 16], m2
19776
19777movu [r0 + 560 * 16], m1
19778movu [r0 + 561 * 16], m2
19779movu [r0 + 562 * 16], m1
19780movu [r0 + 563 * 16], m2
19781movu [r0 + 564 * 16], m1
19782movu [r0 + 565 * 16], m2
19783movu [r0 + 566 * 16], m1
19784movu [r0 + 567 * 16], m2
19785movu [r0 + 568 * 16], m1
19786movu [r0 + 569 * 16], m2
19787movu [r0 + 570 * 16], m1
19788movu [r0 + 571 * 16], m2
19789movu [r0 + 572 * 16], m1
19790movu [r0 + 573 * 16], m2
19791movu [r0 + 574 * 16], m1
19792movu [r0 + 575 * 16], m2
19793
19794; mode 11 [row 0]
19795movu m0, [r4]
19796
19797; mode 11 [row 15 - first half]
19798movu [r0 + 606 * 16], m0
19799
19800movu [r0 + 606 * 16], m0
19801
19802; mode 12 [row 31]
19803pslldq m6, m0, 4
19804pinsrb m6, [r3 + 26], 0
19805pinsrb m6, [r3 + 19], 1
19806pinsrb m6, [r3 + 13], 2
19807pinsrb m6, [r3 + 6], 3
19808movu [r0 + 702 * 16], m6
19809movu m6, [r4 + 12]
19810movu [r0 + 703 * 16], m6
19811
19812; mode 11 [row 31]
19813pslldq m6, m0, 1
19814pinsrb m6, [r3 + 16], 0
19815movu [r0 + 638 * 16], m6
19816movu m6, [r4 + 15]
19817movu [r0 + 639 * 16], m6
19818
19819movd m1, [r4 + 1]
19820palignr m1, m0, 1
19821punpcklbw m0, m1
19822pmaddubsw m1, m0, [r5 + 30 * 16]
19823pmulhrsw m1, m7
19824movu m2, [r4 + 8]
19825movd m3, [r4 + 9]
19826palignr m3, m2, 1
19827punpcklbw m2, m3
19828pmaddubsw m3, m2, [r5 + 30 * 16]
19829pmulhrsw m3, m7
19830packuswb m1, m3
19831movu [r0 + 576 * 16], m1
19832
19833movu m1, [r4 + 16]
19834
19835; mode 11 [row 15 - second half]
19836movu [r0 + 607 * 16], m1
19837
19838movd m3, [r4 + 17]
19839palignr m3, m1, 1
19840punpcklbw m1, m3
19841pmaddubsw m3, m1, [r5 + 30 * 16]
19842pmulhrsw m3, m7
19843movu m4, [r4 + 24]
19844movd m5, [r4 + 25]
19845palignr m5, m4, 1
19846punpcklbw m4, m5
19847pmaddubsw m5, m4, [r5 + 30 * 16]
19848pmulhrsw m5, m7
19849packuswb m3, m5
19850movu [r0 + 577 * 16], m3
19851
19852; mode 11 [row 1]
19853pmaddubsw m3, m0, [r5 + 28 * 16]
19854pmulhrsw m3, m7
19855pmaddubsw m5, m2, [r5 + 28 * 16]
19856pmulhrsw m5, m7
19857packuswb m3, m5
19858movu [r0 + 578 * 16], m3
19859pmaddubsw m3, m1, [r5 + 28 * 16]
19860pmulhrsw m3, m7
19861pmaddubsw m5, m4, [r5 + 28 * 16]
19862pmulhrsw m5, m7
19863packuswb m3, m5
19864movu [r0 + 579 * 16], m3
19865
19866; mode 11 [row 2]
19867pmaddubsw m3, m0, [r5 + 26 * 16]
19868pmulhrsw m3, m7
19869pmaddubsw m5, m2, [r5 + 26 * 16]
19870pmulhrsw m5, m7
19871packuswb m3, m5
19872movu [r0 + 580 * 16], m3
19873pmaddubsw m3, m1, [r5 + 26 * 16]
19874pmulhrsw m3, m7
19875pmaddubsw m5, m4, [r5 + 26 * 16]
19876pmulhrsw m5, m7
19877packuswb m3, m5
19878movu [r0 + 581 * 16], m3
19879
19880; mode 11 [row 3]
19881pmaddubsw m3, m0, [r5 + 24 * 16]
19882pmulhrsw m3, m7
19883pmaddubsw m5, m2, [r5 + 24 * 16]
19884pmulhrsw m5, m7
19885packuswb m3, m5
19886movu [r0 + 582 * 16], m3
19887pmaddubsw m3, m1, [r5 + 24 * 16]
19888pmulhrsw m3, m7
19889pmaddubsw m5, m4, [r5 + 24 * 16]
19890pmulhrsw m5, m7
19891packuswb m3, m5
19892movu [r0 + 583 * 16], m3
19893
19894; mode 11 [row 4]
19895pmaddubsw m3, m0, [r5 + 22 * 16]
19896pmulhrsw m3, m7
19897pmaddubsw m5, m2, [r5 + 22 * 16]
19898pmulhrsw m5, m7
19899packuswb m3, m5
19900movu [r0 + 584 * 16], m3
19901
19902; mode 12 [row 1 - first half]
19903movu [r0 + 642 * 16], m3
19904
19905pmaddubsw m3, m1, [r5 + 22 * 16]
19906pmulhrsw m3, m7
19907pmaddubsw m5, m4, [r5 + 22 * 16]
19908pmulhrsw m5, m7
19909packuswb m3, m5
19910movu [r0 + 585 * 16], m3
19911
19912; mode 12 [row 1 - second half]
19913movu [r0 + 643 * 16], m3
19914
19915; mode 11 [row 5]
19916pmaddubsw m3, m0, [r5 + 20 * 16]
19917pmulhrsw m3, m7
19918pmaddubsw m5, m2, [r5 + 20 * 16]
19919pmulhrsw m5, m7
19920packuswb m3, m5
19921movu [r0 + 586 * 16], m3
19922pmaddubsw m3, m1, [r5 + 20 * 16]
19923pmulhrsw m3, m7
19924pmaddubsw m5, m4, [r5 + 20 * 16]
19925pmulhrsw m5, m7
19926packuswb m3, m5
19927movu [r0 + 587 * 16], m3
19928
19929; mode 11 [row 6]
19930pmaddubsw m3, m0, [r5 + 18 * 16]
19931pmulhrsw m3, m7
19932pmaddubsw m5, m2, [r5 + 18 * 16]
19933pmulhrsw m5, m7
19934packuswb m3, m5
19935movu [r0 + 588 * 16], m3
19936pmaddubsw m3, m1, [r5 + 18 * 16]
19937pmulhrsw m3, m7
19938pmaddubsw m5, m4, [r5 + 18 * 16]
19939pmulhrsw m5, m7
19940packuswb m3, m5
19941movu [r0 + 589 * 16], m3
19942
19943; mode 11 [row 7]
19944pmaddubsw m3, m0, [r5 + 16 * 16]
19945pmulhrsw m3, m7
19946pmaddubsw m5, m2, [r5 + 16 * 16]
19947pmulhrsw m5, m7
19948packuswb m3, m5
19949movu [r0 + 590 * 16], m3
19950pmaddubsw m3, m1, [r5 + 16 * 16]
19951pmulhrsw m3, m7
19952pmaddubsw m5, m4, [r5 + 16 * 16]
19953pmulhrsw m5, m7
19954packuswb m3, m5
19955movu [r0 + 591 * 16], m3
19956
19957; mode 11 [row 8]
19958pmaddubsw m3, m0, [r5 + 14 * 16]
19959pmulhrsw m3, m7
19960pmaddubsw m5, m2, [r5 + 14 * 16]
19961pmulhrsw m5, m7
19962packuswb m3, m5
19963movu [r0 + 592 * 16], m3
19964
19965; mode 13 [row 1 - first half]
19966movu [r0 + 706 * 16], m3
19967
19968pmaddubsw m3, m1, [r5 + 14 * 16]
19969pmulhrsw m3, m7
19970pmaddubsw m5, m4, [r5 + 14 * 16]
19971pmulhrsw m5, m7
19972packuswb m3, m5
19973movu [r0 + 593 * 16], m3
19974
19975; mode 13 [row 1 - second half]
19976movu [r0 + 707 * 16], m3
19977
19978; mode 11 [row 9]
19979pmaddubsw m3, m0, [r5 + 12 * 16]
19980pmulhrsw m3, m7
19981pmaddubsw m5, m2, [r5 + 12 * 16]
19982pmulhrsw m5, m7
19983packuswb m3, m5
19984movu [r0 + 594 * 16], m3
19985
19986; mode 12 [row 3 - first half]
19987movu [r0 + 646 * 16], m3
19988
19989pmaddubsw m3, m1, [r5 + 12 * 16]
19990pmulhrsw m3, m7
19991pmaddubsw m5, m4, [r5 + 12 * 16]
19992pmulhrsw m5, m7
19993packuswb m3, m5
19994movu [r0 + 595 * 16], m3
19995
19996; mode 12 [row 3 - second half]
19997movu [r0 + 647 * 16], m3
19998
19999; mode 11 [row 10]
20000pmaddubsw m3, m0, [r5 + 10 * 16]
20001pmulhrsw m3, m7
20002pmaddubsw m5, m2, [r5 + 10 * 16]
20003pmulhrsw m5, m7
20004packuswb m3, m5
20005movu [r0 + 596 * 16], m3
20006pmaddubsw m3, m1, [r5 + 10 * 16]
20007pmulhrsw m3, m7
20008pmaddubsw m5, m4, [r5 + 10 * 16]
20009pmulhrsw m5, m7
20010packuswb m3, m5
20011movu [r0 + 597 * 16], m3
20012
20013; mode 11 [row 11]
20014pmaddubsw m3, m0, [r5 + 8 * 16]
20015pmulhrsw m3, m7
20016pmaddubsw m5, m2, [r5 + 8 * 16]
20017pmulhrsw m5, m7
20018packuswb m3, m5
20019movu [r0 + 598 * 16], m3
20020pmaddubsw m3, m1, [r5 + 8 * 16]
20021pmulhrsw m3, m7
20022pmaddubsw m5, m4, [r5 + 8 * 16]
20023pmulhrsw m5, m7
20024packuswb m3, m5
20025movu [r0 + 599 * 16], m3
20026
20027; mode 11 [row 12]
20028pmaddubsw m3, m0, [r5 + 6 * 16]
20029pmulhrsw m3, m7
20030pmaddubsw m5, m2, [r5 + 6 * 16]
20031pmulhrsw m5, m7
20032packuswb m3, m5
20033movu [r0 + 600 * 16], m3
20034
20035; mode 14 [row 1 - first half]
20036movu [r0 + 770 * 16], m3
20037
20038pmaddubsw m3, m1, [r5 + 6 * 16]
20039pmulhrsw m3, m7
20040pmaddubsw m5, m4, [r5 + 6 * 16]
20041pmulhrsw m5, m7
20042packuswb m3, m5
20043movu [r0 + 601 * 16], m3
20044
20045; mode 14 [row 1 - second half]
20046movu [r0 + 771 * 16], m3
20047
20048; mode 11 [row 13]
20049pmaddubsw m3, m0, [r5 + 4 * 16]
20050pmulhrsw m3, m7
20051pmaddubsw m5, m2, [r5 + 4 * 16]
20052pmulhrsw m5, m7
20053packuswb m3, m5
20054movu [r0 + 602 * 16], m3
20055pmaddubsw m3, m1, [r5 + 4 * 16]
20056pmulhrsw m3, m7
20057pmaddubsw m5, m4, [r5 + 4 * 16]
20058pmulhrsw m5, m7
20059packuswb m3, m5
20060movu [r0 + 603 * 16], m3
20061
20062; mode 11 [row 14]
20063pmaddubsw m3, m0, [r5 + 2 * 16]
20064pmulhrsw m3, m7
20065pmaddubsw m5, m2, [r5 + 2 * 16]
20066pmulhrsw m5, m7
20067packuswb m3, m5
20068movu [r0 + 604 * 16], m3
20069
20070; mode 12 [row 5 - first half]
20071movu [r0 + 650 * 16], m3
20072
20073pmaddubsw m3, m1, [r5 + 2 * 16]
20074pmulhrsw m3, m7
20075pmaddubsw m5, m4, [r5 + 2 * 16]
20076pmulhrsw m5, m7
20077packuswb m3, m5
20078movu [r0 + 605 * 16], m3
20079
20080; mode 12 [row 5 - second half]
20081movu [r0 + 651 * 16], m3
20082
20083; mode 12 [row 0]
20084pmaddubsw m3, m0, [r5 + 27 * 16]
20085pmulhrsw m3, m7
20086pmaddubsw m5, m2, [r5 + 27 * 16]
20087pmulhrsw m5, m7
20088packuswb m3, m5
20089movu [r0 + 640 * 16], m3
20090pmaddubsw m3, m1, [r5 + 27 * 16]
20091pmulhrsw m3, m7
20092pmaddubsw m5, m4, [r5 + 27 * 16]
20093pmulhrsw m5, m7
20094packuswb m3, m5
20095movu [r0 + 641 * 16], m3
20096
20097; mode 12 [row 2]
20098pmaddubsw m3, m0, [r5 + 17 * 16]
20099pmulhrsw m3, m7
20100pmaddubsw m5, m2, [r5 + 17 * 16]
20101pmulhrsw m5, m7
20102packuswb m3, m5
20103movu [r0 + 644 * 16], m3
20104pmaddubsw m3, m1, [r5 + 17 * 16]
20105pmulhrsw m3, m7
20106pmaddubsw m5, m4, [r5 + 17 * 16]
20107pmulhrsw m5, m7
20108packuswb m3, m5
20109movu [r0 + 645 * 16], m3
20110
20111; mode 12 [row 4]
20112pmaddubsw m3, m0, [r5 + 7 * 16]
20113pmulhrsw m3, m7
20114pmaddubsw m5, m2, [r5 + 7 * 16]
20115pmulhrsw m5, m7
20116packuswb m3, m5
20117movu [r0 + 648 * 16], m3
20118pmaddubsw m3, m1, [r5 + 7 * 16]
20119pmulhrsw m3, m7
20120pmaddubsw m5, m4, [r5 + 7 * 16]
20121pmulhrsw m5, m7
20122packuswb m3, m5
20123movu [r0 + 649 * 16], m3
20124
20125; mode 13 [row 0]
20126pmaddubsw m3, m0, [r5 + 23 * 16]
20127pmulhrsw m3, m7
20128pmaddubsw m5, m2, [r5 + 23 * 16]
20129pmulhrsw m5, m7
20130packuswb m3, m5
20131movu [r0 + 704 * 16], m3
20132pmaddubsw m3, m1, [r5 + 23 * 16]
20133pmulhrsw m3, m7
20134pmaddubsw m5, m4, [r5 + 23 * 16]
20135pmulhrsw m5, m7
20136packuswb m3, m5
20137movu [r0 + 705 * 16], m3
20138
20139; mode 13 [row 2]
20140pmaddubsw m3, m0, [r5 + 5 * 16]
20141pmulhrsw m3, m7
20142pmaddubsw m5, m2, [r5 + 5 * 16]
20143pmulhrsw m5, m7
20144packuswb m3, m5
20145movu [r0 + 708 * 16], m3
20146pmaddubsw m3, m1, [r5 + 5 * 16]
20147pmulhrsw m3, m7
20148pmaddubsw m5, m4, [r5 + 5 * 16]
20149pmulhrsw m5, m7
20150packuswb m3, m5
20151movu [r0 + 709 * 16], m3
20152
20153; mode 14 [row 0]
20154pmaddubsw m3, m0, [r5 + 19 * 16]
20155pmulhrsw m3, m7
20156pmaddubsw m5, m2, [r5 + 19 * 16]
20157pmulhrsw m5, m7
20158packuswb m3, m5
20159movu [r0 + 768 * 16], m3
20160pmaddubsw m3, m1, [r5 + 19 * 16]
20161pmulhrsw m3, m7
20162pmaddubsw m5, m4, [r5 + 19 * 16]
20163pmulhrsw m5, m7
20164packuswb m3, m5
20165movu [r0 + 769 * 16], m3
20166
20167; mode 15 [row 0]
20168pmaddubsw m3, m0, [r5 + 15 * 16]
20169pmulhrsw m3, m7
20170pmaddubsw m5, m2, [r5 + 15 * 16]
20171pmulhrsw m5, m7
20172packuswb m3, m5
20173movu [r0 + 832 * 16], m3
20174pmaddubsw m3, m1, [r5 + 15 * 16]
20175pmulhrsw m3, m7
20176pmaddubsw m5, m4, [r5 + 15 * 16]
20177pmulhrsw m5, m7
20178packuswb m3, m5
20179movu [r0 + 833 * 16], m3
20180
20181; mode 11 [row 16]
20182pslldq m0, 2
20183pinsrb m0, [r4 + 0], 1
20184pinsrb m0, [r3 + 16], 0
20185pmaddubsw m3, m0, [r5 + 30 * 16]
20186pmulhrsw m3, m7
20187pslldq m2, 2
20188pinsrb m2, [r4 + 8], 1
20189pinsrb m2, [r4 + 7], 0
20190pmaddubsw m5, m2, [r5 + 30 * 16]
20191pmulhrsw m5, m7
20192packuswb m3, m5
20193movu [r0 + 608 * 16], m3
20194pslldq m1, 2
20195pinsrb m1, [r4 + 16], 1
20196pinsrb m1, [r4 + 15], 0
20197pmaddubsw m3, m1, [r5 + 30 * 16]
20198pmulhrsw m3, m7
20199pslldq m4, 2
20200pinsrb m4, [r4 + 24], 1
20201pinsrb m4, [r4 + 23], 0
20202pmaddubsw m5, m4, [r5 + 30 * 16]
20203pmulhrsw m5, m7
20204packuswb m3, m5
20205movu [r0 + 609 * 16], m3
20206
20207; mode 11 [row 17]
20208pmaddubsw m3, m0, [r5 + 28 * 16]
20209pmulhrsw m3, m7
20210pmaddubsw m5, m2, [r5 + 28 * 16]
20211pmulhrsw m5, m7
20212packuswb m3, m5
20213movu [r0 + 610 * 16], m3
20214pmaddubsw m3, m1, [r5 + 28 * 16]
20215pmulhrsw m3, m7
20216pmaddubsw m5, m4, [r5 + 28 * 16]
20217pmulhrsw m5, m7
20218packuswb m3, m5
20219movu [r0 + 611 * 16], m3
20220
20221; mode 11 [row 18]
20222pmaddubsw m3, m0, [r5 + 26 * 16]
20223pmulhrsw m3, m7
20224pmaddubsw m5, m2, [r5 + 26 * 16]
20225pmulhrsw m5, m7
20226packuswb m3, m5
20227movu [r0 + 612 * 16], m3
20228pmaddubsw m3, m1, [r5 + 26 * 16]
20229pmulhrsw m3, m7
20230pmaddubsw m5, m4, [r5 + 26 * 16]
20231pmulhrsw m5, m7
20232packuswb m3, m5
20233movu [r0 + 613 * 16], m3
20234
20235; mode 11 [row 19]
20236pmaddubsw m3, m0, [r5 + 24 * 16]
20237pmulhrsw m3, m7
20238pmaddubsw m5, m2, [r5 + 24 * 16]
20239pmulhrsw m5, m7
20240packuswb m3, m5
20241movu [r0 + 614 * 16], m3
20242pmaddubsw m3, m1, [r5 + 24 * 16]
20243pmulhrsw m3, m7
20244pmaddubsw m5, m4, [r5 + 24 * 16]
20245pmulhrsw m5, m7
20246packuswb m3, m5
20247movu [r0 + 615 * 16], m3
20248
20249; mode 11 [row 20]
20250pmaddubsw m3, m0, [r5 + 22 * 16]
20251pmulhrsw m3, m7
20252pmaddubsw m5, m2, [r5 + 22 * 16]
20253pmulhrsw m5, m7
20254packuswb m3, m5
20255movu [r0 + 616 * 16], m3
20256pmaddubsw m3, m1, [r5 + 22 * 16]
20257pmulhrsw m3, m7
20258pmaddubsw m5, m4, [r5 + 22 * 16]
20259pmulhrsw m5, m7
20260packuswb m3, m5
20261movu [r0 + 617 * 16], m3
20262
20263; mode 11 [row 21]
20264pmaddubsw m3, m0, [r5 + 20 * 16]
20265pmulhrsw m3, m7
20266pmaddubsw m5, m2, [r5 + 20 * 16]
20267pmulhrsw m5, m7
20268packuswb m3, m5
20269movu [r0 + 618 * 16], m3
20270pmaddubsw m3, m1, [r5 + 20 * 16]
20271pmulhrsw m3, m7
20272pmaddubsw m5, m4, [r5 + 20 * 16]
20273pmulhrsw m5, m7
20274packuswb m3, m5
20275movu [r0 + 619 * 16], m3
20276
20277; mode 11 [row 22]
20278pmaddubsw m3, m0, [r5 + 18 * 16]
20279pmulhrsw m3, m7
20280pmaddubsw m5, m2, [r5 + 18 * 16]
20281pmulhrsw m5, m7
20282packuswb m3, m5
20283movu [r0 + 620 * 16], m3
20284pmaddubsw m3, m1, [r5 + 18 * 16]
20285pmulhrsw m3, m7
20286pmaddubsw m5, m4, [r5 + 18 * 16]
20287pmulhrsw m5, m7
20288packuswb m3, m5
20289movu [r0 + 621 * 16], m3
20290
20291; mode 11 [row 23]
20292pmaddubsw m3, m0, [r5 + 16 * 16]
20293pmulhrsw m3, m7
20294pmaddubsw m5, m2, [r5 + 16 * 16]
20295pmulhrsw m5, m7
20296packuswb m3, m5
20297movu [r0 + 622 * 16], m3
20298pmaddubsw m3, m1, [r5 + 16 * 16]
20299pmulhrsw m3, m7
20300pmaddubsw m5, m4, [r5 + 16 * 16]
20301pmulhrsw m5, m7
20302packuswb m3, m5
20303movu [r0 + 623 * 16], m3
20304
20305; mode 11 [row 24]
20306pmaddubsw m3, m0, [r5 + 14 * 16]
20307pmulhrsw m3, m7
20308pmaddubsw m5, m2, [r5 + 14 * 16]
20309pmulhrsw m5, m7
20310packuswb m3, m5
20311movu [r0 + 624 * 16], m3
20312pmaddubsw m3, m1, [r5 + 14 * 16]
20313pmulhrsw m3, m7
20314pmaddubsw m5, m4, [r5 + 14 * 16]
20315pmulhrsw m5, m7
20316packuswb m3, m5
20317movu [r0 + 625 * 16], m3
20318
20319; mode 11 [row 25]
20320pmaddubsw m3, m0, [r5 + 12 * 16]
20321pmulhrsw m3, m7
20322pmaddubsw m5, m2, [r5 + 12 * 16]
20323pmulhrsw m5, m7
20324packuswb m3, m5
20325movu [r0 + 626 * 16], m3
20326pmaddubsw m3, m1, [r5 + 12 * 16]
20327pmulhrsw m3, m7
20328pmaddubsw m5, m4, [r5 + 12 * 16]
20329pmulhrsw m5, m7
20330packuswb m3, m5
20331movu [r0 + 627 * 16], m3
20332
20333; mode 11 [row 26]
20334pmaddubsw m3, m0, [r5 + 10 * 16]
20335pmulhrsw m3, m7
20336pmaddubsw m5, m2, [r5 + 10 * 16]
20337pmulhrsw m5, m7
20338packuswb m3, m5
20339movu [r0 + 628 * 16], m3
20340pmaddubsw m3, m1, [r5 + 10 * 16]
20341pmulhrsw m3, m7
20342pmaddubsw m5, m4, [r5 + 10 * 16]
20343pmulhrsw m5, m7
20344packuswb m3, m5
20345movu [r0 + 629 * 16], m3
20346
20347; mode 11 [row 27]
20348pmaddubsw m3, m0, [r5 + 8 * 16]
20349pmulhrsw m3, m7
20350pmaddubsw m5, m2, [r5 + 8 * 16]
20351pmulhrsw m5, m7
20352packuswb m3, m5
20353movu [r0 + 630 * 16], m3
20354pmaddubsw m3, m1, [r5 + 8 * 16]
20355pmulhrsw m3, m7
20356pmaddubsw m5, m4, [r5 + 8 * 16]
20357pmulhrsw m5, m7
20358packuswb m3, m5
20359movu [r0 + 631 * 16], m3
20360
20361; mode 11 [row 28]
20362pmaddubsw m3, m0, [r5 + 6 * 16]
20363pmulhrsw m3, m7
20364pmaddubsw m5, m2, [r5 + 6 * 16]
20365pmulhrsw m5, m7
20366packuswb m3, m5
20367movu [r0 + 632 * 16], m3
20368pmaddubsw m3, m1, [r5 + 6 * 16]
20369pmulhrsw m3, m7
20370pmaddubsw m5, m4, [r5 + 6 * 16]
20371pmulhrsw m5, m7
20372packuswb m3, m5
20373movu [r0 + 633 * 16], m3
20374
20375; mode 11 [row 29]
20376pmaddubsw m3, m0, [r5 + 4 * 16]
20377pmulhrsw m3, m7
20378pmaddubsw m5, m2, [r5 + 4 * 16]
20379pmulhrsw m5, m7
20380packuswb m3, m5
20381movu [r0 + 634 * 16], m3
20382pmaddubsw m3, m1, [r5 + 4 * 16]
20383pmulhrsw m3, m7
20384pmaddubsw m5, m4, [r5 + 4 * 16]
20385pmulhrsw m5, m7
20386packuswb m3, m5
20387movu [r0 + 635 * 16], m3
20388
20389; mode 11 [row 30]
20390pmaddubsw m3, m0, [r5 + 2 * 16]
20391pmulhrsw m3, m7
20392pmaddubsw m5, m2, [r5 + 2 * 16]
20393pmulhrsw m5, m7
20394packuswb m3, m5
20395movu [r0 + 636 * 16], m3
20396pmaddubsw m3, m1, [r5 + 2 * 16]
20397pmulhrsw m3, m7
20398pmaddubsw m5, m4, [r5 + 2 * 16]
20399pmulhrsw m5, m7
20400packuswb m3, m5
20401movu [r0 + 637 * 16], m3
20402
20403; mode 12 [row 6]
20404pinsrb m0, [r3 + 6], 0
20405pmaddubsw m3, m0, [r5 + 29 * 16]
20406pmulhrsw m3, m7
20407pmaddubsw m5, m2, [r5 + 29 * 16]
20408pmulhrsw m5, m7
20409packuswb m3, m5
20410movu [r0 + 652 * 16], m3
20411pmaddubsw m3, m1, [r5 + 29 * 16]
20412pmulhrsw m3, m7
20413pmaddubsw m5, m4, [r5 + 29 * 16]
20414pmulhrsw m5, m7
20415packuswb m3, m5
20416movu [r0 + 653 * 16], m3
20417
20418; mode 12 [row 7]
20419pmaddubsw m3, m0, [r5 + 24 * 16]
20420pmulhrsw m3, m7
20421pmaddubsw m5, m2, [r5 + 24 * 16]
20422pmulhrsw m5, m7
20423packuswb m3, m5
20424movu [r0 + 654 * 16], m3
20425pmaddubsw m3, m1, [r5 + 24 * 16]
20426pmulhrsw m3, m7
20427pmaddubsw m5, m4, [r5 + 24 * 16]
20428pmulhrsw m5, m7
20429packuswb m3, m5
20430movu [r0 + 655 * 16], m3
20431
20432; mode 12 [row 8]
20433pmaddubsw m3, m0, [r5 + 19 * 16]
20434pmulhrsw m3, m7
20435pmaddubsw m5, m2, [r5 + 19 * 16]
20436pmulhrsw m5, m7
20437packuswb m3, m5
20438movu [r0 + 656 * 16], m3
20439pmaddubsw m3, m1, [r5 + 19 * 16]
20440pmulhrsw m3, m7
20441pmaddubsw m5, m4, [r5 + 19 * 16]
20442pmulhrsw m5, m7
20443packuswb m3, m5
20444movu [r0 + 657 * 16], m3
20445
20446; mode 12 [row 9]
20447pmaddubsw m3, m0, [r5 + 14 * 16]
20448pmulhrsw m3, m7
20449pmaddubsw m5, m2, [r5 + 14 * 16]
20450pmulhrsw m5, m7
20451packuswb m3, m5
20452movu [r0 + 658 * 16], m3
20453pmaddubsw m3, m1, [r5 + 14 * 16]
20454pmulhrsw m3, m7
20455pmaddubsw m5, m4, [r5 + 14 * 16]
20456pmulhrsw m5, m7
20457packuswb m3, m5
20458movu [r0 + 659 * 16], m3
20459
20460; mode 12 [row 10]
20461pmaddubsw m3, m0, [r5 + 9 * 16]
20462pmulhrsw m3, m7
20463pmaddubsw m5, m2, [r5 + 9 * 16]
20464pmulhrsw m5, m7
20465packuswb m3, m5
20466movu [r0 + 660 * 16], m3
20467pmaddubsw m3, m1, [r5 + 9 * 16]
20468pmulhrsw m3, m7
20469pmaddubsw m5, m4, [r5 + 9 * 16]
20470pmulhrsw m5, m7
20471packuswb m3, m5
20472movu [r0 + 661 * 16], m3
20473
20474; mode 12 [row 11]
20475pmaddubsw m3, m0, [r5 + 4 * 16]
20476pmulhrsw m3, m7
20477pmaddubsw m5, m2, [r5 + 4 * 16]
20478pmulhrsw m5, m7
20479packuswb m3, m5
20480movu [r0 + 662 * 16], m3
20481pmaddubsw m3, m1, [r5 + 4 * 16]
20482pmulhrsw m3, m7
20483pmaddubsw m5, m4, [r5 + 4 * 16]
20484pmulhrsw m5, m7
20485packuswb m3, m5
20486movu [r0 + 663 * 16], m3
20487
20488; mode 13 [row 3]
20489movu m6, m0
20490pinsrb m6, [r3 + 4], 0
20491pmaddubsw m3, m6, [r5 + 28 * 16]
20492pmulhrsw m3, m7
20493pmaddubsw m5, m2, [r5 + 28 * 16]
20494pmulhrsw m5, m7
20495packuswb m3, m5
20496movu [r0 + 710 * 16], m3
20497pmaddubsw m3, m1, [r5 + 28 * 16]
20498pmulhrsw m3, m7
20499pmaddubsw m5, m4, [r5 + 28 * 16]
20500pmulhrsw m5, m7
20501packuswb m3, m5
20502movu [r0 + 711 * 16], m3
20503
20504; mode 13 [row 4]
20505pmaddubsw m3, m6, [r5 + 19 * 16]
20506pmulhrsw m3, m7
20507pmaddubsw m5, m2, [r5 + 19 * 16]
20508pmulhrsw m5, m7
20509packuswb m3, m5
20510movu [r0 + 712 * 16], m3
20511pmaddubsw m3, m1, [r5 + 19 * 16]
20512pmulhrsw m3, m7
20513pmaddubsw m5, m4, [r5 + 19 * 16]
20514pmulhrsw m5, m7
20515packuswb m3, m5
20516movu [r0 + 713 * 16], m3
20517
20518; mode 13 [row 5]
20519pmaddubsw m3, m6, [r5 + 10 * 16]
20520pmulhrsw m3, m7
20521pmaddubsw m5, m2, [r5 + 10 * 16]
20522pmulhrsw m5, m7
20523packuswb m3, m5
20524movu [r0 + 714 * 16], m3
20525pmaddubsw m3, m1, [r5 + 10 * 16]
20526pmulhrsw m3, m7
20527pmaddubsw m5, m4, [r5 + 10 * 16]
20528pmulhrsw m5, m7
20529packuswb m3, m5
20530movu [r0 + 715 * 16], m3
20531
20532; mode 13 [row 6]
20533pmaddubsw m3, m6, [r5 + 1 * 16]
20534pmulhrsw m3, m7
20535pmaddubsw m5, m2, [r5 + 1 * 16]
20536pmulhrsw m5, m7
20537packuswb m3, m5
20538movu [r0 + 716 * 16], m3
20539pmaddubsw m3, m1, [r5 + 1 * 16]
20540pmulhrsw m3, m7
20541pmaddubsw m5, m4, [r5 + 1 * 16]
20542pmulhrsw m5, m7
20543packuswb m3, m5
20544movu [r0 + 717 * 16], m3
20545
20546; mode 14 [row 2]
20547movu m6, m0
20548pinsrb m6, [r4 + 0], 1
20549pinsrb m6, [r3 + 2], 0
20550pmaddubsw m3, m6, [r5 + 25 * 16]
20551pmulhrsw m3, m7
20552pmaddubsw m5, m2, [r5 + 25 * 16]
20553pmulhrsw m5, m7
20554packuswb m3, m5
20555movu [r0 + 772 * 16], m3
20556pmaddubsw m3, m1, [r5 + 25 * 16]
20557pmulhrsw m3, m7
20558pmaddubsw m5, m4, [r5 + 25 * 16]
20559pmulhrsw m5, m7
20560packuswb m3, m5
20561movu [r0 + 773 * 16], m3
20562
20563; mode 14 [row 3]
20564pmaddubsw m3, m6, [r5 + 12 * 16]
20565pmulhrsw m3, m7
20566pmaddubsw m5, m2, [r5 + 12 * 16]
20567pmulhrsw m5, m7
20568packuswb m3, m5
20569movu [r0 + 774 * 16], m3
20570pmaddubsw m3, m1, [r5 + 12 * 16]
20571pmulhrsw m3, m7
20572pmaddubsw m5, m4, [r5 + 12 * 16]
20573pmulhrsw m5, m7
20574packuswb m3, m5
20575movu [r0 + 775 * 16], m3
20576
20577; mode 15 [row 1]
20578pmaddubsw m3, m6, [r5 + 30 * 16]
20579pmulhrsw m3, m7
20580pmaddubsw m5, m2, [r5 + 30 * 16]
20581pmulhrsw m5, m7
20582packuswb m3, m5
20583movu [r0 + 834 * 16], m3
20584pmaddubsw m3, m1, [r5 + 30 * 16]
20585pmulhrsw m3, m7
20586pmaddubsw m5, m4, [r5 + 30 * 16]
20587pmulhrsw m5, m7
20588packuswb m3, m5
20589movu [r0 + 835 * 16], m3
20590
20591; mode 15 [row 2]
20592pmaddubsw m3, m6, [r5 + 13 * 16]
20593pmulhrsw m3, m7
20594pmaddubsw m5, m2, [r5 + 13 * 16]
20595pmulhrsw m5, m7
20596packuswb m3, m5
20597movu [r0 + 836 * 16], m3
20598pmaddubsw m3, m1, [r5 + 13 * 16]
20599pmulhrsw m3, m7
20600pmaddubsw m5, m4, [r5 + 13 * 16]
20601pmulhrsw m5, m7
20602packuswb m3, m5
20603movu [r0 + 837 * 16], m3
20604
20605; mode 15 [row 3]
20606pslldq m6, 2
20607pinsrb m6, [r3 + 2], 1
20608pinsrb m6, [r3 + 4], 0
20609pmaddubsw m3, m6, [r5 + 28 * 16]
20610pmulhrsw m3, m7
20611pslldq m2, 2
20612pinsrb m2, [r4 + 7], 1
20613pinsrb m2, [r4 + 6], 0
20614pmaddubsw m5, m2, [r5 + 28 * 16]
20615pmulhrsw m5, m7
20616packuswb m3, m5
20617movu [r0 + 838 * 16], m3
20618pslldq m1, 2
20619pinsrb m1, [r4 + 15], 1
20620pinsrb m1, [r4 + 14], 0
20621pmaddubsw m3, m1, [r5 + 28 * 16]
20622pmulhrsw m3, m7
20623pslldq m4, 2
20624pinsrb m4, [r4 + 23], 1
20625pinsrb m4, [r4 + 22], 0
20626pmaddubsw m5, m4, [r5 + 28 * 16]
20627pmulhrsw m5, m7
20628packuswb m3, m5
20629movu [r0 + 839 * 16], m3
20630
20631; mode 15 [row 4]
20632pmaddubsw m3, m6, [r5 + 11 * 16]
20633pmulhrsw m3, m7
20634pmaddubsw m5, m2, [r5 + 11 * 16]
20635pmulhrsw m5, m7
20636packuswb m3, m5
20637movu [r0 + 840 * 16], m3
20638pmaddubsw m3, m1, [r5 + 11 * 16]
20639pmulhrsw m3, m7
20640pmaddubsw m5, m4, [r5 + 11 * 16]
20641pmulhrsw m5, m7
20642packuswb m3, m5
20643movu [r0 + 841 * 16], m3
20644
20645; mode 15 [row 5, 0-7]
20646pslldq m6, 2
20647pinsrb m6, [r3 + 4], 1
20648pinsrb m6, [r3 + 6], 0
20649pmaddubsw m3, m6, [r5 + 26 * 16]
20650pmulhrsw m3, m7
20651packuswb m3, m3
20652movh [r0 + 842 * 16], m3
20653
20654; mode 15 [row 6, 0-7]
20655pmaddubsw m3, m6, [r5 + 9 * 16]
20656pmulhrsw m3, m7
20657packuswb m3, m3
20658movh [r0 + 844 * 16], m3
20659
20660; mode 15 [row 7, 0-7]
20661pslldq m6, 2
20662pinsrb m6, [r3 + 6], 1
20663pinsrb m6, [r3 + 8], 0
20664pmaddubsw m3, m6, [r5 + 24 * 16]
20665pmulhrsw m3, m7
20666packuswb m3, m3
20667movh [r0 + 846 * 16], m3
20668
20669; mode 15 [row 8, 0-7]
20670pmaddubsw m3, m6, [r5 + 7 * 16]
20671pmulhrsw m3, m7
20672packuswb m3, m3
20673movh [r0 + 848 * 16], m3
20674
20675; mode 15 [row 9, 0-7]
20676pslldq m6, 2
20677pinsrb m6, [r3 + 8], 1
20678pinsrb m6, [r3 + 9], 0
20679pmaddubsw m3, m6, [r5 + 22 * 16]
20680pmulhrsw m3, m7
20681packuswb m3, m3
20682movh [r0 + 850 * 16], m3
20683
20684; mode 15 [row 10, 0-7]
20685pmaddubsw m3, m6, [r5 + 5 * 16]
20686pmulhrsw m3, m7
20687packuswb m3, m3
20688movh [r0 + 852 * 16], m3
20689
20690; mode 15 [row 11, 0-7]
20691pslldq m6, 2
20692pinsrb m6, [r3 + 9], 1
20693pinsrb m6, [r3 + 11], 0
20694pmaddubsw m3, m6, [r5 + 20 * 16]
20695pmulhrsw m3, m7
20696packuswb m3, m3
20697movh [r0 + 854 * 16], m3
20698
20699; mode 15 [row 12, 0-7]
20700pmaddubsw m3, m6, [r5 + 3 * 16]
20701pmulhrsw m3, m7
20702packuswb m3, m3
20703movh [r0 + 856 * 16], m3
20704
20705; mode 15 [row 13, 0-7]
20706pslldq m6, 2
20707pinsrb m6, [r3 + 11], 1
20708pinsrb m6, [r3 + 13], 0
20709pmaddubsw m3, m6, [r5 + 18 * 16]
20710pmulhrsw m3, m7
20711packuswb m3, m3
20712movh [r0 + 858 * 16], m3
20713
20714; mode 15 [row 14, 0-7]
20715pmaddubsw m3, m6, [r5 + 1 * 16]
20716pmulhrsw m3, m7
20717packuswb m3, m3
20718movh [r0 + 860 * 16], m3
20719
20720; mode 15 [row 15, 0-7]
20721pslldq m6, 2
20722pinsrb m6, [r3 + 13], 1
20723pinsrb m6, [r3 + 15], 0
20724pmaddubsw m3, m6, [r5 + 16 * 16]
20725pmulhrsw m3, m7
20726packuswb m3, m3
20727movh [r0 + 862 * 16], m3
20728
20729; mode 15 [row 16, 0-7]
20730pslldq m6, 2
20731pinsrb m6, [r3 + 15], 1
20732pinsrb m6, [r3 + 17], 0
20733pmaddubsw m3, m6, [r5 + 31 * 16]
20734pmulhrsw m3, m7
20735packuswb m3, m3
20736movh [r0 + 864 * 16], m3
20737
20738; mode 15 [row 17, 0-7]
20739pmaddubsw m3, m6, [r5 + 14 * 16]
20740pmulhrsw m3, m7
20741packuswb m3, m3
20742movh [r0 + 866 * 16], m3
20743
20744; mode 15 [row 18, 0-7]
20745pslldq m6, 2
20746pinsrb m6, [r3 + 17], 1
20747pinsrb m6, [r3 + 19], 0
20748pmaddubsw m3, m6, [r5 + 29 * 16]
20749pmulhrsw m3, m7
20750packuswb m3, m3
20751movh [r0 + 868 * 16], m3
20752
20753; mode 15 [row 19, 0-7]
20754pmaddubsw m3, m6, [r5 + 12 * 16]
20755pmulhrsw m3, m7
20756packuswb m3, m3
20757movh [r0 + 870 * 16], m3
20758
20759; mode 15 [row 20, 0-7]
20760pslldq m6, 2
20761pinsrb m6, [r3 + 19], 1
20762pinsrb m6, [r3 + 21], 0
20763pmaddubsw m3, m6, [r5 + 27 * 16]
20764pmulhrsw m3, m7
20765packuswb m3, m3
20766movh [r0 + 872 * 16], m3
20767
20768; mode 15 [row 21, 0-7]
20769pmaddubsw m3, m6, [r5 + 10 * 16]
20770pmulhrsw m3, m7
20771packuswb m3, m3
20772movh [r0 + 874 * 16], m3
20773
20774; mode 15 [row 22, 0-7]
20775pslldq m6, 2
20776pinsrb m6, [r3 + 21], 1
20777pinsrb m6, [r3 + 23], 0
20778pmaddubsw m3, m6, [r5 + 25 * 16]
20779pmulhrsw m3, m7
20780packuswb m3, m3
20781movh [r0 + 876 * 16], m3
20782
20783; mode 15 [row 23, 0-7]
20784pmaddubsw m3, m6, [r5 + 8 * 16]
20785pmulhrsw m3, m7
20786packuswb m3, m3
20787movh [r0 + 878 * 16], m3
20788
20789; mode 15 [row 24, 0-7]
20790pslldq m6, 2
20791pinsrb m6, [r3 + 23], 1
20792pinsrb m6, [r3 + 24], 0
20793pmaddubsw m3, m6, [r5 + 23 * 16]
20794pmulhrsw m3, m7
20795packuswb m3, m3
20796movh [r0 + 880 * 16], m3
20797
20798; mode 15 [row 25, 0-7]
20799pmaddubsw m3, m6, [r5 + 6 * 16]
20800pmulhrsw m3, m7
20801packuswb m3, m3
20802movh [r0 + 882 * 16], m3
20803
20804; mode 15 [row 26, 0-7]
20805pslldq m6, 2
20806pinsrb m6, [r3 + 24], 1
20807pinsrb m6, [r3 + 26], 0
20808pmaddubsw m3, m6, [r5 + 21 * 16]
20809pmulhrsw m3, m7
20810packuswb m3, m3
20811movh [r0 + 884 * 16], m3
20812
20813; mode 15 [row 27, 0-7]
20814pmaddubsw m3, m6, [r5 + 4 * 16]
20815pmulhrsw m3, m7
20816packuswb m3, m3
20817movh [r0 + 886 * 16], m3
20818
20819; mode 15 [row 28, 0-7]
20820pslldq m6, 2
20821pinsrb m6, [r3 + 26], 1
20822pinsrb m6, [r3 + 28], 0
20823pmaddubsw m3, m6, [r5 + 19 * 16]
20824pmulhrsw m3, m7
20825packuswb m3, m3
20826movh [r0 + 888 * 16], m3
20827
20828; mode 15 [row 29, 0-7]
20829pmaddubsw m3, m6, [r5 + 2 * 16]
20830pmulhrsw m3, m7
20831packuswb m3, m3
20832movh [r0 + 890 * 16], m3
20833
20834; mode 15 [row 30, 0-7]
20835pslldq m6, 2
20836pinsrb m6, [r3 + 28], 1
20837pinsrb m6, [r3 + 30], 0
20838pmaddubsw m3, m6, [r5 + 17 * 16]
20839pmulhrsw m3, m7
20840packuswb m3, m3
20841movh [r0 + 892 * 16], m3
20842
20843; mode 15 [row 31, 0-7]
20844pshufb m3, m6, [tab_S2]
20845movh [r0 + 894 * 16], m3
20846
20847; mode 12 [row 12]
20848pslldq m0, 2
20849pinsrb m0, [r3 + 6], 1
20850pinsrb m0, [r3 + 13], 0
20851pmaddubsw m3, m0, [r5 + 31 * 16]
20852pmulhrsw m3, m7
20853pmaddubsw m5, m2, [r5 + 31 * 16]
20854pmulhrsw m5, m7
20855packuswb m3, m5
20856movu [r0 + 664 * 16], m3
20857pmaddubsw m3, m1, [r5 + 31 * 16]
20858pmulhrsw m3, m7
20859pmaddubsw m5, m4, [r5 + 31 * 16]
20860pmulhrsw m5, m7
20861packuswb m3, m5
20862movu [r0 + 665 * 16], m3
20863
20864; mode 12 [row 13]
20865pmaddubsw m3, m0, [r5 + 26 * 16]
20866pmulhrsw m3, m7
20867pmaddubsw m5, m2, [r5 + 26 * 16]
20868pmulhrsw m5, m7
20869packuswb m3, m5
20870movu [r0 + 666 * 16], m3
20871pmaddubsw m3, m1, [r5 + 26 * 16]
20872pmulhrsw m3, m7
20873pmaddubsw m5, m4, [r5 + 26 * 16]
20874pmulhrsw m5, m7
20875packuswb m3, m5
20876movu [r0 + 667 * 16], m3
20877
20878; mode 12 [row 14]
20879pmaddubsw m3, m0, [r5 + 21 * 16]
20880pmulhrsw m3, m7
20881pmaddubsw m5, m2, [r5 + 21 * 16]
20882pmulhrsw m5, m7
20883packuswb m3, m5
20884movu [r0 + 668 * 16], m3
20885pmaddubsw m3, m1, [r5 + 21 * 16]
20886pmulhrsw m3, m7
20887pmaddubsw m5, m4, [r5 + 21 * 16]
20888pmulhrsw m5, m7
20889packuswb m3, m5
20890movu [r0 + 669 * 16], m3
20891
20892; mode 12 [row 15]
20893pmaddubsw m3, m0, [r5 + 16 * 16]
20894pmulhrsw m3, m7
20895pmaddubsw m5, m2, [r5 + 16 * 16]
20896pmulhrsw m5, m7
20897packuswb m3, m5
20898movu [r0 + 670 * 16], m3
20899pmaddubsw m3, m1, [r5 + 16 * 16]
20900pmulhrsw m3, m7
20901pmaddubsw m5, m4, [r5 + 16 * 16]
20902pmulhrsw m5, m7
20903packuswb m3, m5
20904movu [r0 + 671 * 16], m3
20905
20906; mode 12 [row 16]
20907pmaddubsw m3, m0, [r5 + 11 * 16]
20908pmulhrsw m3, m7
20909pmaddubsw m5, m2, [r5 + 11 * 16]
20910pmulhrsw m5, m7
20911packuswb m3, m5
20912movu [r0 + 672 * 16], m3
20913pmaddubsw m3, m1, [r5 + 11 * 16]
20914pmulhrsw m3, m7
20915pmaddubsw m5, m4, [r5 + 11 * 16]
20916pmulhrsw m5, m7
20917packuswb m3, m5
20918movu [r0 + 673 * 16], m3
20919
20920; mode 12 [row 17]
20921pmaddubsw m3, m0, [r5 + 6 * 16]
20922pmulhrsw m3, m7
20923pmaddubsw m5, m2, [r5 + 6 * 16]
20924pmulhrsw m5, m7
20925packuswb m3, m5
20926movu [r0 + 674 * 16], m3
20927pmaddubsw m3, m1, [r5 + 6 * 16]
20928pmulhrsw m3, m7
20929pmaddubsw m5, m4, [r5 + 6 * 16]
20930pmulhrsw m5, m7
20931packuswb m3, m5
20932movu [r0 + 675 * 16], m3
20933
20934; mode 12 [row 18]
20935pmaddubsw m3, m0, [r5 + 1 * 16]
20936pmulhrsw m3, m7
20937pmaddubsw m5, m2, [r5 + 1 * 16]
20938pmulhrsw m5, m7
20939packuswb m3, m5
20940movu [r0 + 676 * 16], m3
20941pmaddubsw m3, m1, [r5 + 1 * 16]
20942pmulhrsw m3, m7
20943pmaddubsw m5, m4, [r5 + 1 * 16]
20944pmulhrsw m5, m7
20945packuswb m3, m5
20946movu [r0 + 677 * 16], m3
20947
20948; mode 13 [row 7]
20949movu m6, m0
20950pinsrb m6, [r3 + 4], 2
20951pinsrb m6, [r3 + 4], 1
20952pinsrb m6, [r3 + 7], 0
20953pmaddubsw m3, m6, [r5 + 24 * 16]
20954pmulhrsw m3, m7
20955pmaddubsw m5, m2, [r5 + 24 * 16]
20956pmulhrsw m5, m7
20957packuswb m3, m5
20958movu [r0 + 718 * 16], m3
20959pmaddubsw m3, m1, [r5 + 24 * 16]
20960pmulhrsw m3, m7
20961pmaddubsw m5, m4, [r5 + 24 * 16]
20962pmulhrsw m5, m7
20963packuswb m3, m5
20964movu [r0 + 719 * 16], m3
20965
20966; mode 13 [row 8]
20967pmaddubsw m3, m6, [r5 + 15 * 16]
20968pmulhrsw m3, m7
20969pmaddubsw m5, m2, [r5 + 15 * 16]
20970pmulhrsw m5, m7
20971packuswb m3, m5
20972movu [r0 + 720 * 16], m3
20973pmaddubsw m3, m1, [r5 + 15 * 16]
20974pmulhrsw m3, m7
20975pmaddubsw m5, m4, [r5 + 15 * 16]
20976pmulhrsw m5, m7
20977packuswb m3, m5
20978movu [r0 + 721 * 16], m3
20979
20980; mode 13 [row 9]
20981pmaddubsw m3, m6, [r5 + 6 * 16]
20982pmulhrsw m3, m7
20983pmaddubsw m5, m2, [r5 + 6 * 16]
20984pmulhrsw m5, m7
20985packuswb m3, m5
20986movu [r0 + 722 * 16], m3
20987pmaddubsw m3, m1, [r5 + 6 * 16]
20988pmulhrsw m3, m7
20989pmaddubsw m5, m4, [r5 + 6 * 16]
20990pmulhrsw m5, m7
20991packuswb m3, m5
20992movu [r0 + 723 * 16], m3
20993
20994; mode 14 [row 4]
20995pinsrb m6, [r3 + 2], 2
20996pinsrb m6, [r3 + 2], 1
20997pinsrb m6, [r3 + 5], 0
20998pmaddubsw m3, m6, [r5 + 31 * 16]
20999pmulhrsw m3, m7
21000pmaddubsw m5, m2, [r5 + 31 * 16]
21001pmulhrsw m5, m7
21002packuswb m3, m5
21003movu [r0 + 776 * 16], m3
21004pmaddubsw m3, m1, [r5 + 31 * 16]
21005pmulhrsw m3, m7
21006pmaddubsw m5, m4, [r5 + 31 * 16]
21007pmulhrsw m5, m7
21008packuswb m3, m5
21009movu [r0 + 777 * 16], m3
21010
21011; mode 14 [row 5]
21012pmaddubsw m3, m6, [r5 + 18 * 16]
21013pmulhrsw m3, m7
21014pmaddubsw m5, m2, [r5 + 18 * 16]
21015pmulhrsw m5, m7
21016packuswb m3, m5
21017movu [r0 + 778 * 16], m3
21018pmaddubsw m3, m1, [r5 + 18 * 16]
21019pmulhrsw m3, m7
21020pmaddubsw m5, m4, [r5 + 18 * 16]
21021pmulhrsw m5, m7
21022packuswb m3, m5
21023movu [r0 + 779 * 16], m3
21024
21025; mode 14 [row 6]
21026pmaddubsw m3, m6, [r5 + 5 * 16]
21027pmulhrsw m3, m7
21028pmaddubsw m5, m2, [r5 + 5 * 16]
21029pmulhrsw m5, m7
21030packuswb m3, m5
21031movu [r0 + 780 * 16], m3
21032pmaddubsw m3, m1, [r5 + 5 * 16]
21033pmulhrsw m3, m7
21034pmaddubsw m5, m4, [r5 + 5 * 16]
21035pmulhrsw m5, m7
21036packuswb m3, m5
21037movu [r0 + 781 * 16], m3
21038
21039; mode 14 [row 7]
21040pslldq m6, 2
21041pinsrb m6, [r3 + 5], 1
21042pinsrb m6, [r3 + 7], 0
21043pmaddubsw m3, m6, [r5 + 24 * 16]
21044pmulhrsw m3, m7
21045pslldq m2, 2
21046pinsrw m2, [r4 + 5], 0
21047pmaddubsw m5, m2, [r5 + 24 * 16]
21048pmulhrsw m5, m7
21049packuswb m3, m5
21050movu [r0 + 782 * 16], m3
21051pslldq m1, 2
21052pinsrw m1, [r4 + 13], 0
21053pmaddubsw m3, m1, [r5 + 24 * 16]
21054pmulhrsw m3, m7
21055pslldq m4, 2
21056pinsrw m4, [r4 + 21], 0
21057pmaddubsw m5, m4, [r5 + 24 * 16]
21058pmulhrsw m5, m7
21059packuswb m3, m5
21060movu [r0 + 783 * 16], m3
21061
21062; mode 14 [row 8]
21063pmaddubsw m3, m6, [r5 + 11 * 16]
21064pmulhrsw m3, m7
21065pmaddubsw m5, m2, [r5 + 11 * 16]
21066pmulhrsw m5, m7
21067packuswb m3, m5
21068movu [r0 + 784 * 16], m3
21069pmaddubsw m3, m1, [r5 + 11 * 16]
21070pmulhrsw m3, m7
21071pmaddubsw m5, m4, [r5 + 11 * 16]
21072pmulhrsw m5, m7
21073packuswb m3, m5
21074movu [r0 + 785 * 16], m3
21075
21076; mode 15 [row 5, 8-31]
21077pmaddubsw m5, m2, [r5 + 26 * 16]
21078pmulhrsw m5, m7
21079packuswb m5, m5
21080movh [r0 + 842 * 16 + 8], m5
21081pmaddubsw m3, m1, [r5 + 26 * 16]
21082pmulhrsw m3, m7
21083pmaddubsw m5, m4, [r5 + 26 * 16]
21084pmulhrsw m5, m7
21085packuswb m3, m5
21086movu [r0 + 843 * 16], m3
21087
21088; mode 15 [row 6, 8-31]
21089pmaddubsw m5, m2, [r5 + 9 * 16]
21090pmulhrsw m5, m7
21091packuswb m5, m5
21092movh [r0 + 844 * 16 + 8], m5
21093pmaddubsw m3, m1, [r5 + 9 * 16]
21094pmulhrsw m3, m7
21095pmaddubsw m5, m4, [r5 + 9 * 16]
21096pmulhrsw m5, m7
21097packuswb m3, m5
21098movu [r0 + 845 * 16], m3
21099
21100; mode 12 [row 19]
21101pslldq m0, 2
21102pinsrb m0, [r3 + 13], 1
21103pinsrb m0, [r3 + 19], 0
21104pmaddubsw m3, m0, [r5 + 28 * 16]
21105pmulhrsw m3, m7
21106pmaddubsw m5, m2, [r5 + 28 * 16]
21107pmulhrsw m5, m7
21108packuswb m3, m5
21109movu [r0 + 678 * 16], m3
21110pmaddubsw m3, m1, [r5 + 28 * 16]
21111pmulhrsw m3, m7
21112pmaddubsw m5, m4, [r5 + 28 * 16]
21113pmulhrsw m5, m7
21114packuswb m3, m5
21115movu [r0 + 679 * 16], m3
21116
21117; mode 12 [row 20]
21118pmaddubsw m3, m0, [r5 + 23 * 16]
21119pmulhrsw m3, m7
21120pmaddubsw m5, m2, [r5 + 23 * 16]
21121pmulhrsw m5, m7
21122packuswb m3, m5
21123movu [r0 + 680 * 16], m3
21124pmaddubsw m3, m1, [r5 + 23 * 16]
21125pmulhrsw m3, m7
21126pmaddubsw m5, m4, [r5 + 23 * 16]
21127pmulhrsw m5, m7
21128packuswb m3, m5
21129movu [r0 + 681 * 16], m3
21130
21131; mode 12 [row 21]
21132pmaddubsw m3, m0, [r5 + 18 * 16]
21133pmulhrsw m3, m7
21134pmaddubsw m5, m2, [r5 + 18 * 16]
21135pmulhrsw m5, m7
21136packuswb m3, m5
21137movu [r0 + 682 * 16], m3
21138pmaddubsw m3, m1, [r5 + 18 * 16]
21139pmulhrsw m3, m7
21140pmaddubsw m5, m4, [r5 + 18 * 16]
21141pmulhrsw m5, m7
21142packuswb m3, m5
21143movu [r0 + 683 * 16], m3
21144
21145; mode 12 [row 22]
21146pmaddubsw m3, m0, [r5 + 13 * 16]
21147pmulhrsw m3, m7
21148pmaddubsw m5, m2, [r5 + 13 * 16]
21149pmulhrsw m5, m7
21150packuswb m3, m5
21151movu [r0 + 684 * 16], m3
21152pmaddubsw m3, m1, [r5 + 13 * 16]
21153pmulhrsw m3, m7
21154pmaddubsw m5, m4, [r5 + 13 * 16]
21155pmulhrsw m5, m7
21156packuswb m3, m5
21157movu [r0 + 685 * 16], m3
21158
21159; mode 12 [row 23]
21160pmaddubsw m3, m0, [r5 + 8 * 16]
21161pmulhrsw m3, m7
21162pmaddubsw m5, m2, [r5 + 8 * 16]
21163pmulhrsw m5, m7
21164packuswb m3, m5
21165movu [r0 + 686 * 16], m3
21166pmaddubsw m3, m1, [r5 + 8 * 16]
21167pmulhrsw m3, m7
21168pmaddubsw m5, m4, [r5 + 8 * 16]
21169pmulhrsw m5, m7
21170packuswb m3, m5
21171movu [r0 + 687 * 16], m3
21172
21173; mode 12 [row 24]
21174pmaddubsw m3, m0, [r5 + 3 * 16]
21175pmulhrsw m3, m7
21176pmaddubsw m5, m2, [r5 + 3 * 16]
21177pmulhrsw m5, m7
21178packuswb m3, m5
21179movu [r0 + 688 * 16], m3
21180pmaddubsw m3, m1, [r5 + 3 * 16]
21181pmulhrsw m3, m7
21182pmaddubsw m5, m4, [r5 + 3 * 16]
21183pmulhrsw m5, m7
21184packuswb m3, m5
21185movu [r0 + 689 * 16], m3
21186
21187; mode 13 [row 10] -- register roles change here: m7 takes over the mode 14 reference vector and the pmulhrsw multiplier is read from [pw_1024]
21188movu m7, m6 ; m6 still holds the vector built for mode 14 rows 7-8; keep it in m7, where mode 14 row 9 continues shifting it. NOTE(review): m7 was the pmulhrsw operand in all earlier rows (presumably pw_1024, loaded before this chunk) -- confirm
21189movu m6, m0 ; start from the current mode 12 vector (m0) and patch its low bytes for mode 13
21190pinsrb m6, [r3 + 4], 4
21191pinsrb m6, [r3 + 4], 3 ; bytes 4 and 3 <- [r3 + 4]
21192pinsrb m6, [r3 + 7], 2
21193pinsrb m6, [r3 + 7], 1 ; bytes 2 and 1 <- [r3 + 7]
21194pinsrb m6, [r3 + 11], 0 ; byte 0 <- [r3 + 11]
21195pmaddubsw m3, m6, [r5 + 29 * 16]
21196pmulhrsw m3, [pw_1024] ; memory operand replaces m7 from this row onward
21197pmaddubsw m5, m2, [r5 + 29 * 16]
21198pmulhrsw m5, [pw_1024]
21199packuswb m3, m5
21200movu [r0 + 724 * 16], m3 ; first 16 bytes of the row (m6 and m2 results)
21201pmaddubsw m3, m1, [r5 + 29 * 16]
21202pmulhrsw m3, [pw_1024]
21203pmaddubsw m5, m4, [r5 + 29 * 16]
21204pmulhrsw m5, [pw_1024]
21205packuswb m3, m5
21206movu [r0 + 725 * 16], m3 ; second 16 bytes of the row (m1 and m4 results)
21207
21208; mode 13 [row 11]
21209pmaddubsw m3, m6, [r5 + 20 * 16]
21210pmulhrsw m3, [pw_1024]
21211pmaddubsw m5, m2, [r5 + 20 * 16]
21212pmulhrsw m5, [pw_1024]
21213packuswb m3, m5
21214movu [r0 + 726 * 16], m3
21215pmaddubsw m3, m1, [r5 + 20 * 16]
21216pmulhrsw m3, [pw_1024]
21217pmaddubsw m5, m4, [r5 + 20 * 16]
21218pmulhrsw m5, [pw_1024]
21219packuswb m3, m5
21220movu [r0 + 727 * 16], m3
21221
21222; mode 13 [row 12]
21223pmaddubsw m3, m6, [r5 + 11 * 16]
21224pmulhrsw m3, [pw_1024]
21225pmaddubsw m5, m2, [r5 + 11 * 16]
21226pmulhrsw m5, [pw_1024]
21227packuswb m3, m5
21228movu [r0 + 728 * 16], m3
21229pmaddubsw m3, m1, [r5 + 11 * 16]
21230pmulhrsw m3, [pw_1024]
21231pmaddubsw m5, m4, [r5 + 11 * 16]
21232pmulhrsw m5, [pw_1024]
21233packuswb m3, m5
21234movu [r0 + 729 * 16], m3
21235
21236; mode 13 [row 13]
21237pmaddubsw m3, m6, [r5 + 2 * 16]
21238pmulhrsw m3, [pw_1024]
21239pmaddubsw m5, m2, [r5 + 2 * 16]
21240pmulhrsw m5, [pw_1024]
21241packuswb m3, m5
21242movu [r0 + 730 * 16], m3
21243pmaddubsw m3, m1, [r5 + 2 * 16]
21244pmulhrsw m3, [pw_1024]
21245pmaddubsw m5, m4, [r5 + 2 * 16]
21246pmulhrsw m5, [pw_1024]
21247packuswb m3, m5
21248movu [r0 + 731 * 16], m3
21249
21250; mode 14 [row 9] -- shift all four reference vectors by one pixel pair, refill the low pair, then emit the 32-byte row
21251pslldq m7, 2
21252pinsrb m7, [r3 + 7], 1
21253pinsrb m7, [r3 + 10], 0 ; the two new bytes come from non-adjacent addresses, so they need separate byte inserts
21254pmaddubsw m3, m7, [r5 + 30 * 16]
21255pmulhrsw m3, [pw_1024]
21256pslldq m2, 2
21257pinsrw m2, [r4 + 4], 0
21258pmaddubsw m5, m2, [r5 + 30 * 16]
21259pmulhrsw m5, [pw_1024]
21260packuswb m3, m5
21261movu [r0 + 786 * 16], m3 ; first 16 bytes of the row
21262pslldq m1, 2
21263pinsrw m1, [r4 + 12], 0
21264pmaddubsw m3, m1, [r5 + 30 * 16]
21265pmulhrsw m3, [pw_1024]
21266pslldq m4, 2
21267pinsrw m4, [r4 + 20], 0 ; one word insert = byte 0 <- [r4 + 20], byte 1 <- [r4 + 21]; replaces two pinsrb, matching m2/m1 above
21269pmaddubsw m5, m4, [r5 + 30 * 16]
21270pmulhrsw m5, [pw_1024]
21271packuswb m3, m5
21272movu [r0 + 787 * 16], m3 ; second 16 bytes of the row
21273
21274; mode 14 [row 10]
21275pmaddubsw m3, m7, [r5 + 17 * 16]
21276pmulhrsw m3, [pw_1024]
21277pmaddubsw m5, m2, [r5 + 17 * 16]
21278pmulhrsw m5, [pw_1024]
21279packuswb m3, m5
21280movu [r0 + 788 * 16], m3
21281pmaddubsw m3, m1, [r5 + 17 * 16]
21282pmulhrsw m3, [pw_1024]
21283pmaddubsw m5, m4, [r5 + 17 * 16]
21284pmulhrsw m5, [pw_1024]
21285packuswb m3, m5
21286movu [r0 + 789 * 16], m3
21287
21288; mode 14 [row 11]
21289pmaddubsw m3, m7, [r5 + 4 * 16]
21290pmulhrsw m3, [pw_1024]
21291pmaddubsw m5, m2, [r5 + 4 * 16]
21292pmulhrsw m5, [pw_1024]
21293packuswb m3, m5
21294movu [r0 + 790 * 16], m3
21295pmaddubsw m3, m1, [r5 + 4 * 16]
21296pmulhrsw m3, [pw_1024]
21297pmaddubsw m5, m4, [r5 + 4 * 16]
21298pmulhrsw m5, [pw_1024]
21299packuswb m3, m5
21300movu [r0 + 791 * 16], m3
21301
21302movu m6, [pw_1024]
21303
21304; mode 15 [row 7, 8-31]
21305pmaddubsw m5, m2, [r5 + 24 * 16]
21306pmulhrsw m5, m6
21307packuswb m5, m5
21308movh [r0 + 846 * 16 + 8], m5
21309pmaddubsw m3, m1, [r5 + 24 * 16]
21310pmulhrsw m3, m6
21311pmaddubsw m5, m4, [r5 + 24 * 16]
21312pmulhrsw m5, m6
21313packuswb m3, m5
21314movu [r0 + 847 * 16], m3
21315
21316; mode 15 [row 8, 8-31]
21317pmaddubsw m5, m2, [r5 + 7 * 16]
21318pmulhrsw m5, m6
21319packuswb m5, m5
21320movh [r0 + 848 * 16 + 8], m5
21321pmaddubsw m3, m1, [r5 + 7 * 16]
21322pmulhrsw m3, m6
21323pmaddubsw m5, m4, [r5 + 7 * 16]
21324pmulhrsw m5, m6
21325packuswb m3, m5
21326movu [r0 + 849 * 16], m3
21327
21328; mode 12 [row 25]
21329pslldq m0, 2
21330pinsrb m0, [r3 + 19], 1
21331pinsrb m0, [r3 + 26], 0
21332pmaddubsw m3, m0, [r5 + 30 * 16]
21333pmulhrsw m3, [pw_1024]
21334pmaddubsw m5, m2, [r5 + 30 * 16]
21335pmulhrsw m5, [pw_1024]
21336packuswb m3, m5
21337movu [r0 + 690 * 16], m3
21338pmaddubsw m3, m1, [r5 + 30 * 16]
21339pmulhrsw m3, [pw_1024]
21340pmaddubsw m5, m4, [r5 + 30 * 16]
21341pmulhrsw m5, [pw_1024]
21342packuswb m3, m5
21343movu [r0 + 691 * 16], m3
21344
21345; mode 12 [row 26]
21346pmaddubsw m3, m0, [r5 + 25 * 16]
21347pmulhrsw m3, [pw_1024]
21348pmaddubsw m5, m2, [r5 + 25 * 16]
21349pmulhrsw m5, [pw_1024]
21350packuswb m3, m5
21351movu [r0 + 692 * 16], m3
21352pmaddubsw m3, m1, [r5 + 25 * 16]
21353pmulhrsw m3, [pw_1024]
21354pmaddubsw m5, m4, [r5 + 25 * 16]
21355pmulhrsw m5, [pw_1024]
21356packuswb m3, m5
21357movu [r0 + 693 * 16], m3
21358
21359; mode 12 [row 27]
21360pmaddubsw m3, m0, [r5 + 20 * 16]
21361pmulhrsw m3, [pw_1024]
21362pmaddubsw m5, m2, [r5 + 20 * 16]
21363pmulhrsw m5, [pw_1024]
21364packuswb m3, m5
21365movu [r0 + 694 * 16], m3
21366pmaddubsw m3, m1, [r5 + 20 * 16]
21367pmulhrsw m3, [pw_1024]
21368pmaddubsw m5, m4, [r5 + 20 * 16]
21369pmulhrsw m5, [pw_1024]
21370packuswb m3, m5
21371movu [r0 + 695 * 16], m3
21372
21373; mode 12 [row 28]
21374pmaddubsw m3, m0, [r5 + 15 * 16]
21375pmulhrsw m3, [pw_1024]
21376pmaddubsw m5, m2, [r5 + 15 * 16]
21377pmulhrsw m5, [pw_1024]
21378packuswb m3, m5
21379movu [r0 + 696 * 16], m3
21380pmaddubsw m3, m1, [r5 + 15 * 16]
21381pmulhrsw m3, [pw_1024]
21382pmaddubsw m5, m4, [r5 + 15 * 16]
21383pmulhrsw m5, [pw_1024]
21384packuswb m3, m5
21385movu [r0 + 697 * 16], m3
21386
21387; mode 12 [row 29]
21388pmaddubsw m3, m0, [r5 + 10 * 16]
21389pmulhrsw m3, [pw_1024]
21390pmaddubsw m5, m2, [r5 + 10 * 16]
21391pmulhrsw m5, [pw_1024]
21392packuswb m3, m5
21393movu [r0 + 698 * 16], m3
21394pmaddubsw m3, m1, [r5 + 10 * 16]
21395pmulhrsw m3, [pw_1024]
21396pmaddubsw m5, m4, [r5 + 10 * 16]
21397pmulhrsw m5, [pw_1024]
21398packuswb m3, m5
21399movu [r0 + 699 * 16], m3
21400
21401; mode 12 [row 30]
21402pmaddubsw m3, m0, [r5 + 5 * 16]
21403pmulhrsw m3, [pw_1024]
21404pmaddubsw m5, m2, [r5 + 5 * 16]
21405pmulhrsw m5, [pw_1024]
21406packuswb m3, m5
21407movu [r0 + 700 * 16], m3
21408pmaddubsw m3, m1, [r5 + 5 * 16]
21409pmulhrsw m3, [pw_1024]
21410pmaddubsw m5, m4, [r5 + 5 * 16]
21411pmulhrsw m5, [pw_1024]
21412packuswb m3, m5
21413movu [r0 + 701 * 16], m3
21414
21415; mode 13 [row 14]
21416movu m6, m0
21417pinsrb m6, [r3 + 4], 6
21418pinsrb m6, [r3 + 4], 5
21419pinsrb m6, [r3 + 7], 4
21420pinsrb m6, [r3 + 7], 3
21421pinsrb m6, [r3 + 11], 2
21422pinsrb m6, [r3 + 11], 1
21423pinsrb m6, [r3 + 14], 0
21424pmaddubsw m3, m6, [r5 + 25 * 16]
21425pmulhrsw m3, [pw_1024]
21426pmaddubsw m5, m2, [r5 + 25 * 16]
21427pmulhrsw m5, [pw_1024]
21428packuswb m3, m5
21429movu [r0 + 732 * 16], m3
21430pmaddubsw m3, m1, [r5 + 25 * 16]
21431pmulhrsw m3, [pw_1024]
21432pmaddubsw m5, m4, [r5 + 25 * 16]
21433pmulhrsw m5, [pw_1024]
21434packuswb m3, m5
21435movu [r0 + 733 * 16], m3
21436
21437; mode 13 [row 15]
21438pmaddubsw m3, m6, [r5 + 16 * 16]
21439pmulhrsw m3, [pw_1024]
21440pmaddubsw m5, m2, [r5 + 16 * 16]
21441pmulhrsw m5, [pw_1024]
21442packuswb m3, m5
21443movu [r0 + 734 * 16], m3
21444pmaddubsw m3, m1, [r5 + 16 * 16]
21445pmulhrsw m3, [pw_1024]
21446pmaddubsw m5, m4, [r5 + 16 * 16]
21447pmulhrsw m5, [pw_1024]
21448packuswb m3, m5
21449movu [r0 + 735 * 16], m3
21450
21451; mode 13 [row 16]
21452pmaddubsw m3, m6, [r5 + 7 * 16]
21453pmulhrsw m3, [pw_1024]
21454pmaddubsw m5, m2, [r5 + 7 * 16]
21455pmulhrsw m5, [pw_1024]
21456packuswb m3, m5
21457movu [r0 + 736 * 16], m3
21458pmaddubsw m3, m1, [r5 + 7 * 16]
21459pmulhrsw m3, [pw_1024]
21460pmaddubsw m5, m4, [r5 + 7 * 16]
21461pmulhrsw m5, [pw_1024]
21462packuswb m3, m5
21463movu [r0 + 737 * 16], m3
21464
21465; mode 13 [row 17]
21466pslldq m6, 2
21467pinsrb m6, [r3 + 14], 1
21468pinsrb m6, [r3 + 18], 0
21469pmaddubsw m3, m6, [r5 + 30 * 16]
21470pmulhrsw m3, [pw_1024]
21471pslldq m2, 2
21472pinsrw m2, [r4 + 3], 0
21473pmaddubsw m5, m2, [r5 + 30 * 16]
21474pmulhrsw m5, [pw_1024]
21475packuswb m3, m5
21476movu [r0 + 738 * 16], m3
21477pslldq m1, 2
21478pinsrw m1, [r4 + 11], 0
21479pmaddubsw m3, m1, [r5 + 30 * 16]
21480pmulhrsw m3, [pw_1024]
21481pslldq m4, 2
21482pinsrw m4, [r4 + 19], 0
21483pmaddubsw m5, m4, [r5 + 30 * 16]
21484pmulhrsw m5, [pw_1024]
21485packuswb m3, m5
21486movu [r0 + 739 * 16], m3
21487
21488; mode 13 [row 18]
21489pmaddubsw m3, m6, [r5 + 21 * 16]
21490pmulhrsw m3, [pw_1024]
21491pmaddubsw m5, m2, [r5 + 21 * 16]
21492pmulhrsw m5, [pw_1024]
21493packuswb m3, m5
21494movu [r0 + 740 * 16], m3
21495pmaddubsw m3, m1, [r5 + 21 * 16]
21496pmulhrsw m3, [pw_1024]
21497pmaddubsw m5, m4, [r5 + 21 * 16]
21498pmulhrsw m5, [pw_1024]
21499packuswb m3, m5
21500movu [r0 + 741 * 16], m3
21501
21502; mode 13 [row 19]
21503pmaddubsw m3, m6, [r5 + 12 * 16]
21504pmulhrsw m3, [pw_1024]
21505pmaddubsw m5, m2, [r5 + 12 * 16]
21506pmulhrsw m5, [pw_1024]
21507packuswb m3, m5
21508movu [r0 + 742 * 16], m3
21509pmaddubsw m3, m1, [r5 + 12 * 16]
21510pmulhrsw m3, [pw_1024]
21511pmaddubsw m5, m4, [r5 + 12 * 16]
21512pmulhrsw m5, [pw_1024]
21513packuswb m3, m5
21514movu [r0 + 743 * 16], m3
21515
21516; mode 13 [row 20]
21517pmaddubsw m3, m6, [r5 + 3 * 16]
21518pmulhrsw m3, [pw_1024]
21519pmaddubsw m5, m2, [r5 + 3 * 16]
21520pmulhrsw m5, [pw_1024]
21521packuswb m3, m5
21522movu [r0 + 744 * 16], m3
21523pmaddubsw m3, m1, [r5 + 3 * 16]
21524pmulhrsw m3, [pw_1024]
21525pmaddubsw m5, m4, [r5 + 3 * 16]
21526pmulhrsw m5, [pw_1024]
21527packuswb m3, m5
21528movu [r0 + 745 * 16], m3
21529
21530; mode 14 [row 12]
21531pslldq m7, 2
21532pinsrb m7, [r3 + 10], 1
21533pinsrb m7, [r3 + 12], 0
21534pmaddubsw m3, m7, [r5 + 23 * 16]
21535pmulhrsw m3, [pw_1024]
21536pmaddubsw m5, m2, [r5 + 23 * 16]
21537pmulhrsw m5, [pw_1024]
21538packuswb m3, m5
21539movu [r0 + 792 * 16], m3
21540pmaddubsw m3, m1, [r5 + 23 * 16]
21541pmulhrsw m3, [pw_1024]
21542pmaddubsw m5, m4, [r5 + 23 * 16]
21543pmulhrsw m5, [pw_1024]
21544packuswb m3, m5
21545movu [r0 + 793 * 16], m3
21546
21547; mode 14 [row 13]
21548pmaddubsw m3, m7, [r5 + 10 * 16]
21549pmulhrsw m3, [pw_1024]
21550pmaddubsw m5, m2, [r5 + 10 * 16]
21551pmulhrsw m5, [pw_1024]
21552packuswb m3, m5
21553movu [r0 + 794 * 16], m3
21554pmaddubsw m3, m1, [r5 + 10 * 16]
21555pmulhrsw m3, [pw_1024]
21556pmaddubsw m5, m4, [r5 + 10 * 16]
21557pmulhrsw m5, [pw_1024]
21558packuswb m3, m5
21559movu [r0 + 795 * 16], m3
21560
21561; mode 15 [row 9, 8-31] -- m2 produces bytes 8-15 of the row, m1/m4 produce bytes 16-31
21562pmaddubsw m5, m2, [r5 + 22 * 16]
21563pmulhrsw m5, [pw_1024]
21564packuswb m5, m5 ; the 8 results are duplicated in both halves of m5
21565movh [r0 + 850 * 16 + 8], m5 ; 8-byte store; was movu, which also wrote the duplicate half over bytes 16-23 and relied on the store below to repair them
21566pmaddubsw m3, m1, [r5 + 22 * 16]
21567pmulhrsw m3, [pw_1024]
21568pmaddubsw m5, m4, [r5 + 22 * 16]
21569pmulhrsw m5, [pw_1024]
21570packuswb m3, m5
21571movu [r0 + 851 * 16], m3 ; bytes 16-31 of the row
21572
21573; mode 15 [row 10, 8-31] -- m2 produces bytes 8-15 of the row, m1/m4 produce bytes 16-31
21574pmaddubsw m5, m2, [r5 + 5 * 16]
21575pmulhrsw m5, [pw_1024]
21576packuswb m5, m5 ; the 8 results are duplicated in both halves of m5
21577movh [r0 + 852 * 16 + 8], m5 ; 8-byte store; was movu, which also wrote the duplicate half over bytes 16-23 and relied on the store below to repair them
21578pmaddubsw m3, m1, [r5 + 5 * 16]
21579pmulhrsw m3, [pw_1024]
21580pmaddubsw m5, m4, [r5 + 5 * 16]
21581pmulhrsw m5, [pw_1024]
21582packuswb m3, m5
21583movu [r0 + 853 * 16], m3 ; bytes 16-31 of the row
21584
21585; mode 13 [row 21]
21586pslldq m6, 2
21587pinsrb m6, [r3 + 18], 1
21588pinsrb m6, [r3 + 21], 0
21589pmaddubsw m3, m6, [r5 + 26 * 16]
21590pmulhrsw m3, [pw_1024]
21591pslldq m2, 2
21592pinsrw m2, [r4 + 2], 0
21593pmaddubsw m5, m2, [r5 + 26 * 16]
21594pmulhrsw m5, [pw_1024]
21595packuswb m3, m5
21596movu [r0 + 746 * 16], m3
21597pslldq m1, 2
21598pinsrw m1, [r4 + 10], 0
21599pmaddubsw m3, m1, [r5 + 26 * 16]
21600pmulhrsw m3, [pw_1024]
21601pslldq m4, 2
21602pinsrw m4, [r4 + 18], 0
21603pmaddubsw m5, m4, [r5 + 26 * 16]
21604pmulhrsw m5, [pw_1024]
21605packuswb m3, m5
21606movu [r0 + 747 * 16], m3
21607
21608; mode 13 [row 22]
21609pmaddubsw m3, m6, [r5 + 17 * 16]
21610pmulhrsw m3, [pw_1024]
21611pmaddubsw m5, m2, [r5 + 17 * 16]
21612pmulhrsw m5, [pw_1024]
21613packuswb m3, m5
21614movu [r0 + 748 * 16], m3
21615pmaddubsw m3, m1, [r5 + 17 * 16]
21616pmulhrsw m3, [pw_1024]
21617pmaddubsw m5, m4, [r5 + 17 * 16]
21618pmulhrsw m5, [pw_1024]
21619packuswb m3, m5
21620movu [r0 + 749 * 16], m3
21621
21622; mode 13 [row 23]
21623pmaddubsw m3, m6, [r5 + 8 * 16]
21624pmulhrsw m3, [pw_1024]
21625pmaddubsw m5, m2, [r5 + 8 * 16]
21626pmulhrsw m5, [pw_1024]
21627packuswb m3, m5
21628movu [r0 + 750 * 16], m3
21629pmaddubsw m3, m1, [r5 + 8 * 16]
21630pmulhrsw m3, [pw_1024]
21631pmaddubsw m5, m4, [r5 + 8 * 16]
21632pmulhrsw m5, [pw_1024]
21633packuswb m3, m5
21634movu [r0 + 751 * 16], m3
21635
21636; mode 14 [row 14]
21637pslldq m7, 2
21638pinsrb m7, [r3 + 12], 1
21639pinsrb m7, [r3 + 15], 0
21640pmaddubsw m3, m7, [r5 + 29 * 16]
21641pmulhrsw m3, [pw_1024]
21642pmaddubsw m5, m2, [r5 + 29 * 16]
21643pmulhrsw m5, [pw_1024]
21644packuswb m3, m5
21645movu [r0 + 796 * 16], m3
21646pmaddubsw m3, m1, [r5 + 29 * 16]
21647pmulhrsw m3, [pw_1024]
21648pmaddubsw m5, m4, [r5 + 29 * 16]
21649pmulhrsw m5, [pw_1024]
21650packuswb m3, m5
21651movu [r0 + 797 * 16], m3
21652
21653; mode 14 [row 15]
21654pmaddubsw m3, m7, [r5 + 16 * 16]
21655pmulhrsw m3, [pw_1024]
21656pmaddubsw m5, m2, [r5 + 16 * 16]
21657pmulhrsw m5, [pw_1024]
21658packuswb m3, m5
21659movu [r0 + 798 * 16], m3
21660pmaddubsw m3, m1, [r5 + 16 * 16]
21661pmulhrsw m3, [pw_1024]
21662pmaddubsw m5, m4, [r5 + 16 * 16]
21663pmulhrsw m5, [pw_1024]
21664packuswb m3, m5
21665movu [r0 + 799 * 16], m3
21666
21667; mode 14 [row 16]
21668pmaddubsw m3, m7, [r5 + 3 * 16]
21669pmulhrsw m3, [pw_1024]
21670pmaddubsw m5, m2, [r5 + 3 * 16]
21671pmulhrsw m5, [pw_1024]
21672packuswb m3, m5
21673movu [r0 + 800 * 16], m3
21674pmaddubsw m3, m1, [r5 + 3 * 16]
21675pmulhrsw m3, [pw_1024]
21676pmaddubsw m5, m4, [r5 + 3 * 16]
21677pmulhrsw m5, [pw_1024]
21678packuswb m3, m5
21679movu [r0 + 801 * 16], m3
21680
21681; mode 15 [row 11, 8-31]
21682pmaddubsw m5, m2, [r5 + 20 * 16]
21683pmulhrsw m5, [pw_1024]
21684packuswb m5, m5
21685movh [r0 + 854 * 16 + 8], m5
21686pmaddubsw m3, m1, [r5 + 20 * 16]
21687pmulhrsw m3, [pw_1024]
21688pmaddubsw m5, m4, [r5 + 20 * 16]
21689pmulhrsw m5, [pw_1024]
21690packuswb m3, m5
21691movu [r0 + 855 * 16], m3
21692
21693; mode 15 [row 12, 8-31]
21694pmaddubsw m5, m2, [r5 + 3 * 16]
21695pmulhrsw m5, [pw_1024]
21696packuswb m5, m5
21697movh [r0 + 856 * 16 + 8], m5
21698pmaddubsw m3, m1, [r5 + 3 * 16]
21699pmulhrsw m3, [pw_1024]
21700pmaddubsw m5, m4, [r5 + 3 * 16]
21701pmulhrsw m5, [pw_1024]
21702packuswb m3, m5
21703movu [r0 + 857 * 16], m3
21704
21705; mode 13 [row 24]
21706pslldq m6, 2
21707pinsrb m6, [r3 + 21], 1
21708pinsrb m6, [r3 + 25], 0
21709pmaddubsw m3, m6, [r5 + 31 * 16]
21710pmulhrsw m3, [pw_1024]
21711pslldq m2, 2
21712pinsrw m2, [r4 + 1], 0
21713pmaddubsw m5, m2, [r5 + 31 * 16]
21714pmulhrsw m5, [pw_1024]
21715packuswb m3, m5
21716movu [r0 + 752 * 16], m3
21717pslldq m1, 2
21718pinsrw m1, [r4 + 9], 0
21719pmaddubsw m3, m1, [r5 + 31 * 16]
21720pmulhrsw m3, [pw_1024]
21721pslldq m4, 2
21722pinsrw m4, [r4 + 17], 0
21723pmaddubsw m5, m4, [r5 + 31 * 16]
21724pmulhrsw m5, [pw_1024]
21725packuswb m3, m5
21726movu [r0 + 753 * 16], m3
21727
21728; mode 13 [row 25]
21729pmaddubsw m3, m6, [r5 + 22 * 16]
21730pmulhrsw m3, [pw_1024]
21731pmaddubsw m5, m2, [r5 + 22 * 16]
21732pmulhrsw m5, [pw_1024]
21733packuswb m3, m5
21734movu [r0 + 754 * 16], m3
21735pmaddubsw m3, m1, [r5 + 22 * 16]
21736pmulhrsw m3, [pw_1024]
21737pmaddubsw m5, m4, [r5 + 22 * 16]
21738pmulhrsw m5, [pw_1024]
21739packuswb m3, m5
21740movu [r0 + 755 * 16], m3
21741
21742; mode 13 [row 26]
21743pmaddubsw m3, m6, [r5 + 13 * 16]
21744pmulhrsw m3, [pw_1024]
21745pmaddubsw m5, m2, [r5 + 13 * 16]
21746pmulhrsw m5, [pw_1024]
21747packuswb m3, m5
21748movu [r0 + 756 * 16], m3
21749pmaddubsw m3, m1, [r5 + 13 * 16]
21750pmulhrsw m3, [pw_1024]
21751pmaddubsw m5, m4, [r5 + 13 * 16]
21752pmulhrsw m5, [pw_1024]
21753packuswb m3, m5
21754movu [r0 + 757 * 16], m3
21755
21756; mode 13 [row 27]
21757pmaddubsw m3, m6, [r5 + 4 * 16]
21758pmulhrsw m3, [pw_1024]
21759pmaddubsw m5, m2, [r5 + 4 * 16]
21760pmulhrsw m5, [pw_1024]
21761packuswb m3, m5
21762movu [r0 + 758 * 16], m3
21763pmaddubsw m3, m1, [r5 + 4 * 16]
21764pmulhrsw m3, [pw_1024]
21765pmaddubsw m5, m4, [r5 + 4 * 16]
21766pmulhrsw m5, [pw_1024]
21767packuswb m3, m5
21768movu [r0 + 759 * 16], m3
21769
21770; mode 14 [row 17]
21771pslldq m7, 2
21772pinsrb m7, [r3 + 15], 1
21773pinsrb m7, [r3 + 17], 0
21774pmaddubsw m3, m7, [r5 + 22 * 16]
21775pmulhrsw m3, [pw_1024]
21776pmaddubsw m5, m2, [r5 + 22 * 16]
21777pmulhrsw m5, [pw_1024]
21778packuswb m3, m5
21779movu [r0 + 802 * 16], m3
21780pmaddubsw m3, m1, [r5 + 22 * 16]
21781pmulhrsw m3, [pw_1024]
21782pmaddubsw m5, m4, [r5 + 22 * 16]
21783pmulhrsw m5, [pw_1024]
21784packuswb m3, m5
21785movu [r0 + 803 * 16], m3
21786
21787; mode 14 [row 18]
21788pmaddubsw m3, m7, [r5 + 9 * 16]
21789pmulhrsw m3, [pw_1024]
21790pmaddubsw m5, m2, [r5 + 9 * 16]
21791pmulhrsw m5, [pw_1024]
21792packuswb m3, m5
21793movu [r0 + 804 * 16], m3
21794pmaddubsw m3, m1, [r5 + 9 * 16]
21795pmulhrsw m3, [pw_1024]
21796pmaddubsw m5, m4, [r5 + 9 * 16]
21797pmulhrsw m5, [pw_1024]
21798packuswb m3, m5
21799movu [r0 + 805 * 16], m3
21800
21801; mode 15 [row 13, 8-31]
21802pmaddubsw m5, m2, [r5 + 18 * 16]
21803pmulhrsw m5, [pw_1024]
21804packuswb m5, m5
21805movh [r0 + 858 * 16 + 8], m5
21806pmaddubsw m3, m1, [r5 + 18 * 16]
21807pmulhrsw m3, [pw_1024]
21808pmaddubsw m5, m4, [r5 + 18 * 16]
21809pmulhrsw m5, [pw_1024]
21810packuswb m3, m5
21811movu [r0 + 859 * 16], m3
21812
21813; mode 15 [row 14, 8-31]
21814pmaddubsw m5, m2, [r5 + 1 * 16]
21815pmulhrsw m5, [pw_1024]
21816packuswb m5, m5
21817movh [r0 + 860 * 16 + 8], m5
21818pmaddubsw m3, m1, [r5 + 1 * 16]
21819pmulhrsw m3, [pw_1024]
21820pmaddubsw m5, m4, [r5 + 1 * 16]
21821pmulhrsw m5, [pw_1024]
21822packuswb m3, m5
21823movu [r0 + 861 * 16], m3
21824
21825; mode 13 [row 28]
21826pslldq m6, 2
21827pinsrb m6, [r3 + 25], 1
21828pinsrb m6, [r3 + 28], 0
21829pmaddubsw m3, m6, [r5 + 27 * 16]
21830pmulhrsw m3, [pw_1024]
21831pslldq m2, 2
21832pinsrw m2, [r4 + 0], 0
21833pmaddubsw m5, m2, [r5 + 27 * 16]
21834pmulhrsw m5, [pw_1024]
21835packuswb m3, m5
21836movu [r0 + 760 * 16], m3
21837pslldq m1, 2
21838pinsrw m1, [r4 + 8], 0
21839pmaddubsw m3, m1, [r5 + 27 * 16]
21840pmulhrsw m3, [pw_1024]
21841pslldq m4, 2
21842pinsrw m4, [r4 + 16], 0
21843pmaddubsw m5, m4, [r5 + 27 * 16]
21844pmulhrsw m5, [pw_1024]
21845packuswb m3, m5
21846movu [r0 + 761 * 16], m3
21847
21848; mode 13 [row 29]
21849pmaddubsw m3, m6, [r5 + 18 * 16]
21850pmulhrsw m3, [pw_1024]
21851pmaddubsw m5, m2, [r5 + 18 * 16]
21852pmulhrsw m5, [pw_1024]
21853packuswb m3, m5
21854movu [r0 + 762 * 16], m3
21855pmaddubsw m3, m1, [r5 + 18 * 16]
21856pmulhrsw m3, [pw_1024]
21857pmaddubsw m5, m4, [r5 + 18 * 16]
21858pmulhrsw m5, [pw_1024]
21859packuswb m3, m5
21860movu [r0 + 763 * 16], m3
21861
21862; mode 13 [row 30]
21863pmaddubsw m3, m6, [r5 + 9 * 16]
21864pmulhrsw m3, [pw_1024]
21865pmaddubsw m5, m2, [r5 + 9 * 16]
21866pmulhrsw m5, [pw_1024]
21867packuswb m3, m5
21868movu [r0 + 764 * 16], m3
21869pmaddubsw m3, m1, [r5 + 9 * 16]
21870pmulhrsw m3, [pw_1024]
21871pmaddubsw m5, m4, [r5 + 9 * 16]
21872pmulhrsw m5, [pw_1024]
21873packuswb m3, m5
21874movu [r0 + 765 * 16], m3
21875
21876; mode 14 [row 19]
21877pslldq m7, 2
21878pinsrb m7, [r3 + 17], 1
21879pinsrb m7, [r3 + 20], 0
21880pmaddubsw m3, m7, [r5 + 28 * 16]
21881pmulhrsw m3, [pw_1024]
21882pmaddubsw m5, m2, [r5 + 28 * 16]
21883pmulhrsw m5, [pw_1024]
21884packuswb m3, m5
21885movu [r0 + 806 * 16], m3
21886pmaddubsw m3, m1, [r5 + 28 * 16]
21887pmulhrsw m3, [pw_1024]
21888pmaddubsw m5, m4, [r5 + 28 * 16]
21889pmulhrsw m5, [pw_1024]
21890packuswb m3, m5
21891movu [r0 + 807 * 16], m3
21892
21893; mode 14 [row 20]
21894pmaddubsw m3, m7, [r5 + 15 * 16]
21895pmulhrsw m3, [pw_1024]
21896pmaddubsw m5, m2, [r5 + 15 * 16]
21897pmulhrsw m5, [pw_1024]
21898packuswb m3, m5
21899movu [r0 + 808 * 16], m3
21900pmaddubsw m3, m1, [r5 + 15 * 16]
21901pmulhrsw m3, [pw_1024]
21902pmaddubsw m5, m4, [r5 + 15 * 16]
21903pmulhrsw m5, [pw_1024]
21904packuswb m3, m5
21905movu [r0 + 809 * 16], m3
21906
21907; mode 14 [row 21]
21908pmaddubsw m3, m7, [r5 + 2 * 16]
21909pmulhrsw m3, [pw_1024]
21910pmaddubsw m5, m2, [r5 + 2 * 16]
21911pmulhrsw m5, [pw_1024]
21912packuswb m3, m5
21913movu [r0 + 810 * 16], m3
21914pmaddubsw m3, m1, [r5 + 2 * 16]
21915pmulhrsw m3, [pw_1024]
21916pmaddubsw m5, m4, [r5 + 2 * 16]
21917pmulhrsw m5, [pw_1024]
21918packuswb m3, m5
21919movu [r0 + 811 * 16], m3
21920
21921; mode 15 [row 15, 8-31]
21922pmaddubsw m5, m2, [r5 + 16 * 16]
21923pmulhrsw m5, [pw_1024]
21924packuswb m5, m5
21925movh [r0 + 862 * 16 + 8], m5
21926pmaddubsw m3, m1, [r5 + 16 * 16]
21927pmulhrsw m3, [pw_1024]
21928pmaddubsw m5, m4, [r5 + 16 * 16]
21929pmulhrsw m5, [pw_1024]
21930packuswb m3, m5
21931movu [r0 + 863 * 16], m3
21932
21933; mode 14 [row 22]
21934pslldq m7, 2
21935pinsrb m7, [r3 + 20], 1
21936pinsrb m7, [r3 + 22], 0
21937pmaddubsw m3, m7, [r5 + 21 * 16]
21938pmulhrsw m3, [pw_1024]
21939pslldq m2, 2
21940pinsrb m2, [r4 + 0], 1
21941pinsrb m2, [r3 + 2], 0
21942pmaddubsw m5, m2, [r5 + 21 * 16]
21943pmulhrsw m5, [pw_1024]
21944packuswb m3, m5
21945movu [r0 + 812 * 16], m3
21946pslldq m1, 2
21947pinsrw m1, [r4 + 7], 0
21948pmaddubsw m3, m1, [r5 + 21 * 16]
21949pmulhrsw m3, [pw_1024]
21950pslldq m4, 2
21951pinsrw m4, [r4 + 15], 0
21952pmaddubsw m5, m4, [r5 + 21 * 16]
21953pmulhrsw m5, [pw_1024]
21954packuswb m3, m5
21955movu [r0 + 813 * 16], m3
21956
21957; mode 14 [row 23]
21958pmaddubsw m3, m7, [r5 + 8 * 16]
21959pmulhrsw m3, [pw_1024]
21960pmaddubsw m5, m2, [r5 + 8 * 16]
21961pmulhrsw m5, [pw_1024]
21962packuswb m3, m5
21963movu [r0 + 814 * 16], m3
21964pmaddubsw m3, m1, [r5 + 8 * 16]
21965pmulhrsw m3, [pw_1024]
21966pmaddubsw m5, m4, [r5 + 8 * 16]
21967pmulhrsw m5, [pw_1024]
21968packuswb m3, m5
21969movu [r0 + 815 * 16], m3
21970
21971; mode 15 [row 16, 8-31]
21972pmaddubsw m5, m2, [r5 + 31 * 16]
21973pmulhrsw m5, [pw_1024]
21974packuswb m5, m5
21975movh [r0 + 864 * 16 + 8], m5
21976pmaddubsw m3, m1, [r5 + 31 * 16]
21977pmulhrsw m3, [pw_1024]
21978pmaddubsw m5, m4, [r5 + 31 * 16]
21979pmulhrsw m5, [pw_1024]
21980packuswb m3, m5
21981movu [r0 + 865 * 16], m3
21982
21983; mode 15 [row 17, 8-31]
21984pmaddubsw m5, m2, [r5 + 14 * 16]
21985pmulhrsw m5, [pw_1024]
21986packuswb m5, m5
21987movh [r0 + 866 * 16 + 8], m5
21988pmaddubsw m3, m1, [r5 + 14 * 16]
21989pmulhrsw m3, [pw_1024]
21990pmaddubsw m5, m4, [r5 + 14 * 16]
21991pmulhrsw m5, [pw_1024]
21992packuswb m3, m5
21993movu [r0 + 867 * 16], m3
21994
21995; mode 14 [row 24]
21996pslldq m7, 2
21997pinsrb m7, [r3 + 22], 1
21998pinsrb m7, [r3 + 25], 0
21999pmaddubsw m3, m7, [r5 + 27 * 16]
22000pmulhrsw m3, [pw_1024]
22001pslldq m2, 2
22002pinsrb m2, [r3 + 2], 1
22003pinsrb m2, [r3 + 5], 0
22004pmaddubsw m5, m2, [r5 + 27 * 16]
22005pmulhrsw m5, [pw_1024]
22006packuswb m3, m5
22007movu [r0 + 816 * 16], m3
22008pslldq m1, 2
22009pinsrw m1, [r4 + 6], 0
22010pmaddubsw m3, m1, [r5 + 27 * 16]
22011pmulhrsw m3, [pw_1024]
22012pslldq m4, 2
22013pinsrw m4, [r4 + 14], 0
22014pmaddubsw m5, m4, [r5 + 27 * 16]
22015pmulhrsw m5, [pw_1024]
22016packuswb m3, m5
22017movu [r0 + 817 * 16], m3
22018
22019; mode 14 [row 25]
22020pmaddubsw m3, m7, [r5 + 14 * 16]
22021pmulhrsw m3, [pw_1024]
22022pmaddubsw m5, m2, [r5 + 14 * 16]
22023pmulhrsw m5, [pw_1024]
22024packuswb m3, m5
22025movu [r0 + 818 * 16], m3
22026pmaddubsw m3, m1, [r5 + 14 * 16]
22027pmulhrsw m3, [pw_1024]
22028pmaddubsw m5, m4, [r5 + 14 * 16]
22029pmulhrsw m5, [pw_1024]
22030packuswb m3, m5
22031movu [r0 + 819 * 16], m3
22032
22033; mode 14 [row 26]
22034pmaddubsw m3, m7, [r5 + 1 * 16]
22035pmulhrsw m3, [pw_1024]
22036pmaddubsw m5, m2, [r5 + 1 * 16]
22037pmulhrsw m5, [pw_1024]
22038packuswb m3, m5
22039movu [r0 + 820 * 16], m3
22040pmaddubsw m3, m1, [r5 + 1 * 16]
22041pmulhrsw m3, [pw_1024]
22042pmaddubsw m5, m4, [r5 + 1 * 16]
22043pmulhrsw m5, [pw_1024]
22044packuswb m3, m5
22045movu [r0 + 821 * 16], m3
22046
22047; mode 15 [row 18] - output bytes 8 to 31 only
; m2 is shared with mode 14: the mode 14 [row 24] section left [r3 + 5] in
; byte 0.  Byte 0 is overwritten with [r3 + 4] here so the same register can
; feed mode 15; the mode 14 [row 27] section writes [r3 + 5] back before it
; shifts m2 again.
22048pinsrb m2, [r3 + 4], 0
22049pmaddubsw m5, m2, [r5 + 29 * 16]
22050pmulhrsw m5, [pw_1024]
22051packuswb m5, m5
; only 8 bytes are stored, into the upper half of the first 16-byte slot
22052movh [r0 + 868 * 16 + 8], m5
; bytes 16 to 31: m1 and m4 are used exactly as mode 14 left them
22053pmaddubsw m3, m1, [r5 + 29 * 16]
22054pmulhrsw m3, [pw_1024]
22055pmaddubsw m5, m4, [r5 + 29 * 16]
22056pmulhrsw m5, [pw_1024]
22057packuswb m3, m5
22058movu [r0 + 869 * 16], m3
22059
22060; mode 15 [row 19]
22061pmaddubsw m5, m2, [r5 + 12 * 16]
22062pmulhrsw m5, [pw_1024]
22063packuswb m5, m5
22064movh [r0 + 870 * 16 + 8], m5
22065pmaddubsw m3, m1, [r5 + 12 * 16]
22066pmulhrsw m3, [pw_1024]
22067pmaddubsw m5, m4, [r5 + 12 * 16]
22068pmulhrsw m5, [pw_1024]
22069packuswb m3, m5
22070movu [r0 + 871 * 16], m3
22071
22072; mode 15 [row 20 - 8 to 15]
; From this row through row 31, bytes 8 to 15 of mode 15 are computed from
; m3, a shifted copy of m2.  m2 itself is left untouched so the mode 14
; sections further down can keep using it (next modified in mode 14 [row 27]).
; The three-operand pslldq is not a native SSE form; presumably the
; x86inc.asm macro layer expands it to a register copy followed by the
; shift -- TODO confirm.
22073pslldq m3, m2, 2
; fill the pair vacated by the 2-byte shift: byte 1 = [r3 + 4], byte 0 = [r3 + 6]
22074pinsrb m3, [r3 + 4], 1
22075pinsrb m3, [r3 + 6], 0
22076pmaddubsw m5, m3, [r5 + 27 * 16]
22077pmulhrsw m5, [pw_1024]
22078packuswb m5, m5
; bytes 16 to 31 of this row are written later, in the
; "mode 15 [row 20 - 16 to 31]" section (slot 873 * 16)
22079movh [r0 + 872 * 16 + 8], m5
22080
22081; mode 15 [row 21 - 8 to 15]
22082pmaddubsw m5, m3, [r5 + 10 * 16]
22083pmulhrsw m5, [pw_1024]
22084packuswb m5, m5
22085movh [r0 + 874 * 16 + 8], m5
22086
22087; mode 15 [row 22 - 8 to 15]
22088pslldq m3, 2
22089pinsrb m3, [r3 + 6], 1
22090pinsrb m3, [r3 + 8], 0
22091pmaddubsw m5, m3, [r5 + 25 * 16]
22092pmulhrsw m5, [pw_1024]
22093packuswb m5, m5
22094movh [r0 + 876 * 16 + 8], m5
22095
22096; mode 15 [row 23 - 8 to 15]
22097pmaddubsw m5, m3, [r5 + 8 * 16]
22098pmulhrsw m5, [pw_1024]
22099packuswb m5, m5
22100movh [r0 + 878 * 16 + 8], m5
22101
22102; mode 15 [row 24 - 8 to 15]
22103pslldq m3, 2
22104pinsrb m3, [r3 + 8], 1
22105pinsrb m3, [r3 + 9], 0
22106pmaddubsw m5, m3, [r5 + 23 * 16]
22107pmulhrsw m5, [pw_1024]
22108packuswb m5, m5
22109movh [r0 + 880 * 16 + 8], m5
22110
22111; mode 15 [row 25 - 8 to 15]
22112pmaddubsw m5, m3, [r5 + 6 * 16]
22113pmulhrsw m5, [pw_1024]
22114packuswb m5, m5
22115movh [r0 + 882 * 16 + 8], m5
22116
22117; mode 15 [row 26 - 8 to 15]
22118pslldq m3, 2
22119pinsrb m3, [r3 + 9], 1
22120pinsrb m3, [r3 + 11], 0
22121pmaddubsw m5, m3, [r5 + 21 * 16]
22122pmulhrsw m5, [pw_1024]
22123packuswb m5, m5
22124movh [r0 + 884 * 16 + 8], m5
22125
22126; mode 15 [row 27 - 8 to 15]
22127pmaddubsw m5, m3, [r5 + 4 * 16]
22128pmulhrsw m5, [pw_1024]
22129packuswb m5, m5
22130movh [r0 + 886 * 16 + 8], m5
22131
22132; mode 15 [row 28 - 8 to 15]
22133pslldq m3, 2
22134pinsrb m3, [r3 + 11], 1
22135pinsrb m3, [r3 + 13], 0
22136pmaddubsw m5, m3, [r5 + 19 * 16]
22137pmulhrsw m5, [pw_1024]
22138packuswb m5, m5
22139movh [r0 + 888 * 16 + 8], m5
22140
22141; mode 15 [row 29 - 8 to 15]
22142pmaddubsw m5, m3, [r5 + 2 * 16]
22143pmulhrsw m5, [pw_1024]
22144packuswb m5, m5
22145movh [r0 + 890 * 16 + 8], m5
22146
22147; mode 15 [row 30 - 8 to 15]
22148pslldq m3, 2
22149pinsrb m3, [r3 + 13], 1
22150pinsrb m3, [r3 + 15], 0
22151pmaddubsw m5, m3, [r5 + 17 * 16]
22152pmulhrsw m5, [pw_1024]
22153packuswb m5, m5
22154movh [r0 + 892 * 16 + 8], m5
22155
22156; mode 15 [row 31, 8 to 15]
; No multiply/round/pack for this row: the 8 output bytes are selected
; directly from the pair register m3 by a byte shuffle.
; NOTE(review): tab_S2 is defined outside this section; presumably it picks
; one byte out of every pair because the interpolation fraction for this row
; is 0 -- confirm against the table definition.
22157pshufb m5, m3, [tab_S2]
22158movh [r0 + 894 * 16 + 8], m5
22159
22160; mode 14 [row 27]
; Undo the temporary change made by the mode 15 [row 18] section, which put
; [r3 + 4] into byte 0 of m2.  This must happen before the shift below,
; because the shift moves byte 0 up to byte 2, where it stays in use.
22161pinsrb m2, [r3 + 5], 0
; bytes 0 to 7: shift the pair register by one pair and insert the new pair
22162pslldq m7, 2
22163pinsrb m7, [r3 + 25], 1
22164pinsrb m7, [r3 + 27], 0
22165pmaddubsw m3, m7, [r5 + 20 * 16]
22166pmulhrsw m3, [pw_1024]
; bytes 8 to 15
22167pslldq m2, 2
22168pinsrb m2, [r3 + 5], 1
22169pinsrb m2, [r3 + 7], 0
22170pmaddubsw m5, m2, [r5 + 20 * 16]
22171pmulhrsw m5, [pw_1024]
22172packuswb m3, m5
22173movu [r0 + 822 * 16], m3
; bytes 16 to 23: the new pair is one word read from [r4 + 5]
; (byte 0 = [r4 + 5], byte 1 = [r4 + 6])
22174pslldq m1, 2
22175pinsrw m1, [r4 + 5], 0
22176pmaddubsw m3, m1, [r5 + 20 * 16]
22177pmulhrsw m3, [pw_1024]
; bytes 24 to 31
22178pslldq m4, 2
22179pinsrw m4, [r4 + 13], 0
22180pmaddubsw m5, m4, [r5 + 20 * 16]
22181pmulhrsw m5, [pw_1024]
22182packuswb m3, m5
22183movu [r0 + 823 * 16], m3
; m1 and m4 as shifted here are also consumed by the following
; "mode 15 [row 20 - 16 to 31]" and "[row 21 - 16 to 31]" sections.
22184
22185; mode 15 [row 20 - 16 to 31]
22186pmaddubsw m3, m1, [r5 + 27 * 16]
22187pmulhrsw m3, [pw_1024]
22188pmaddubsw m5, m4, [r5 + 27 * 16]
22189pmulhrsw m5, [pw_1024]
22190packuswb m3, m5
22191movu [r0 + 873 * 16], m3
22192
22193; mode 15 [row 21 - 16 to 31]
22194pmaddubsw m3, m1, [r5 + 10 * 16]
22195pmulhrsw m3, [pw_1024]
22196pmaddubsw m5, m4, [r5 + 10 * 16]
22197pmulhrsw m5, [pw_1024]
22198packuswb m3, m5
22199movu [r0 + 875 * 16], m3
22200
22201; mode 14 [row 28]
22202pmaddubsw m3, m7, [r5 + 7 * 16]
22203pmulhrsw m3, [pw_1024]
22204pmaddubsw m5, m2, [r5 + 7 * 16]
22205pmulhrsw m5, [pw_1024]
22206packuswb m3, m5
22207movu [r0 + 824 * 16], m3
22208pmaddubsw m3, m1, [r5 + 7 * 16]
22209pmulhrsw m3, [pw_1024]
22210pmaddubsw m5, m4, [r5 + 7 * 16]
22211pmulhrsw m5, [pw_1024]
22212packuswb m3, m5
22213movu [r0 + 825 * 16], m3
22214
22215; mode 14 [row 29]
22216pslldq m7, 2
22217pinsrb m7, [r3 + 27], 1
22218pinsrb m7, [r3 + 30], 0
22219pmaddubsw m3, m7, [r5 + 26 * 16]
22220pmulhrsw m3, [pw_1024]
22221pslldq m2, 2
22222pinsrb m2, [r3 + 7], 1
22223pinsrb m2, [r3 + 10], 0
22224pmaddubsw m5, m2, [r5 + 26 * 16]
22225pmulhrsw m5, [pw_1024]
22226packuswb m3, m5
22227movu [r0 + 826 * 16], m3
22228pslldq m1, 2
22229pinsrw m1, [r4 + 4], 0
22230pmaddubsw m3, m1, [r5 + 26 * 16]
22231pmulhrsw m3, [pw_1024]
22232pslldq m4, 2
22233pinsrw m4, [r4 + 12], 0
22234pmaddubsw m5, m4, [r5 + 26 * 16]
22235pmulhrsw m5, [pw_1024]
22236packuswb m3, m5
22237movu [r0 + 827 * 16], m3
22238
22239; mode 14 [row 30]
22240pmaddubsw m3, m7, [r5 + 13 * 16]
22241pmulhrsw m3, [pw_1024]
22242pmaddubsw m5, m2, [r5 + 13 * 16]
22243pmulhrsw m5, [pw_1024]
22244packuswb m3, m5
22245movu [r0 + 828 * 16], m3
22246pmaddubsw m3, m1, [r5 + 13 * 16]
22247pmulhrsw m3, [pw_1024]
22248pmaddubsw m5, m4, [r5 + 13 * 16]
22249pmulhrsw m5, [pw_1024]
22250packuswb m3, m5
22251movu [r0 + 829 * 16], m3
22252
22253; mode 15 [row 22 - 16 to 31]
; Only the second 16-byte slot of the row (877 * 16) is written here; bytes
; 8 to 15 were stored earlier by the "mode 15 [row 22 - 8 to 15]" section.
; m1 and m4 hold the pairs last shifted in by the mode 14 [row 29] section.
22254pmaddubsw m3, m1, [r5 + 25 * 16]
22255pmulhrsw m3, [pw_1024]
22256pmaddubsw m5, m4, [r5 + 25 * 16]
22257pmulhrsw m5, [pw_1024]
22258packuswb m3, m5
22259movu [r0 + 877 * 16], m3
22260
22261; mode 15 [row 23 - 16 to 31]
; Same source pairs (m1, m4) as the section for row 22 above, with a
; different multiplier row.  Only the second 16-byte slot (879 * 16) is
; written; bytes 8 to 15 were stored earlier by the
; "mode 15 [row 23 - 8 to 15]" section.
22262pmaddubsw m3, m1, [r5 + 8 * 16]
22263pmulhrsw m3, [pw_1024]
22264pmaddubsw m5, m4, [r5 + 8 * 16]
22265pmulhrsw m5, [pw_1024]
22266packuswb m3, m5
22267movu [r0 + 879 * 16], m3
22268
22269; mode 14 [row 31]
22270pshufb m3, m7, [tab_S2]
22271movh [r0 + 830 * 16], m3
22272pshufb m3, m2, [tab_S2]
22273movh [r0 + 830 * 16 + 8], m3
22274pshufb m3, m1, [tab_S2]
22275movh [r0 + 831 * 16], m3
22276pshufb m3, m4, [tab_S2]
22277movh [r0 + 831 * 16 + 8], m3
22278
22279; mode 13 [row 31]
; No arithmetic for this row.
; bytes 0 to 7: byte shuffle of m6 through tab_S2 (m6 is set up before this
; section -- presumably the mode 13 pair register; TODO confirm)
22280pshufb m0, m6, [tab_S2]
22281movh [r0 + 766 * 16], m0
; bytes 8 to 15: straight copy of the 8 bytes at [r4]
22282movh m0, [r4]
22283movh [r0 + 766 * 16 + 8], m0
; bytes 16 to 31: straight copy of the 16 bytes at [r4 + 8]
22284movu m0, [r4 + 8]
22285movu [r0 + 767 * 16], m0
22286
22287; mode 15 [row 24]
22288pslldq m1, 2
22289pinsrw m1, [r4 + 3], 0
22290pmaddubsw m3, m1, [r5 + 23 * 16]
22291pmulhrsw m3, [pw_1024]
22292pslldq m4, 2
22293pinsrw m4, [r4 + 11], 0
22294pmaddubsw m5, m4, [r5 + 23 * 16]
22295pmulhrsw m5, [pw_1024]
22296packuswb m3, m5
22297movu [r0 + 881 * 16], m3
22298
22299; mode 15 [row 25]
22300pmaddubsw m3, m1, [r5 + 6 * 16]
22301pmulhrsw m3, [pw_1024]
22302pmaddubsw m5, m4, [r5 + 6 * 16]
22303pmulhrsw m5, [pw_1024]
22304packuswb m3, m5
22305movu [r0 + 883 * 16], m3
22306
22307; mode 15 [row 26]
22308pslldq m1, 2
22309pinsrw m1, [r4 + 2], 0
22310pmaddubsw m3, m1, [r5 + 21 * 16]
22311pmulhrsw m3, [pw_1024]
22312pslldq m4, 2
22313pinsrw m4, [r4 + 10], 0
22314pmaddubsw m5, m4, [r5 + 21 * 16]
22315pmulhrsw m5, [pw_1024]
22316packuswb m3, m5
22317movu [r0 + 885 * 16], m3
22318
22319; mode 15 [row 27]
22320pmaddubsw m3, m1, [r5 + 4 * 16]
22321pmulhrsw m3, [pw_1024]
22322pmaddubsw m5, m4, [r5 + 4 * 16]
22323pmulhrsw m5, [pw_1024]
22324packuswb m3, m5
22325movu [r0 + 887 * 16], m3
22326
22327; mode 15 [row 28]
22328pslldq m1, 2
22329pinsrw m1, [r4 + 1], 0
22330pmaddubsw m3, m1, [r5 + 19 * 16]
22331pmulhrsw m3, [pw_1024]
22332pslldq m4, 2
22333pinsrw m4, [r4 + 9], 0
22334pmaddubsw m5, m4, [r5 + 19 * 16]
22335pmulhrsw m5, [pw_1024]
22336packuswb m3, m5
22337movu [r0 + 889 * 16], m3
22338
22339; mode 15 [row 29]
22340pmaddubsw m3, m1, [r5 + 2 * 16]
22341pmulhrsw m3, [pw_1024]
22342pmaddubsw m5, m4, [r5 + 2 * 16]
22343pmulhrsw m5, [pw_1024]
22344packuswb m3, m5
22345movu [r0 + 891 * 16], m3
22346
22347; mode 15 [row 30]
22348pslldq m1, 2
22349pinsrw m1, [r4 + 0], 0
22350pmaddubsw m3, m1, [r5 + 17 * 16]
22351pmulhrsw m3, [pw_1024]
22352pslldq m4, 2
22353pinsrw m4, [r4 + 8], 0
22354pmaddubsw m5, m4, [r5 + 17 * 16]
22355pmulhrsw m5, [pw_1024]
22356packuswb m3, m5
22357movu [r0 + 893 * 16], m3
22358
22359; mode 15 [row 31]
22360pshufb m5, m1, [tab_S2]
22361movh [r0 + 895 * 16], m5
22362pshufb m5, m4, [tab_S2]
22363movh [r0 + 895 * 16 + 8], m5
22364
22365; mode 16 [row 0]
; Sets up the registers that every later mode 16 row reuses:
;   m6 = multiplier row for the current output row (reloaded per row)
;   m7 = [pw_1024], the rounding scale used by every pmulhrsw below
;   m0 / m2 / m1 / m4 = interleaved byte pairs ([r4 + i], [r4 + i + 1]) for
;                       output bytes 0-7 / 8-15 / 16-23 / 24-31
; NOTE(review): row index 11 matches (-21 * 1) & 31, the HEVC fraction for
; angular mode 16 -- assumes r5 points at a per-fraction multiplier table;
; confirm where r5 is set.
22366movu m6, [r5 + 11 * 16]
22367movu m7, [pw_1024]
; bytes 0 to 7: load 8 bytes at [r4] and at [r4 + 1], interleave into pairs
22368movh m0, [r4 ]
22369movh m1, [r4 + 1 ]
22370punpcklbw m0, m1
; multiply unsigned pairs by signed multipliers and add each pair, then round
; (with words of 1024 in m7 the pmulhrsw equals (sum + 16) >> 5)
22371pmaddubsw m1, m0, m6
22372pmulhrsw m1, m7
; bytes 8 to 15
22373movh m2, [r4 + 8]
22374movh m3, [r4 + 9]
22375punpcklbw m2, m3
22376pmaddubsw m3, m2, m6
22377pmulhrsw m3, m7
; saturate both word vectors to bytes and store the first 16-byte slot
22378packuswb m1, m3
22379movu [r0 + 896 * 16], m1
22380
; bytes 16 to 23 (m1 is reused as a pair register from here on)
22381movh m1, [r4 + 16]
22382movh m3, [r4 + 17]
22383punpcklbw m1, m3
22384pmaddubsw m3, m1, m6
22385pmulhrsw m3, m7
; bytes 24 to 31
22386movh m4, [r4 + 24]
22387movh m5, [r4 + 25]
22388punpcklbw m4, m5
22389pmaddubsw m5, m4, m6
22390pmulhrsw m5, m7
22391packuswb m3, m5
22392movu [r0 + 897 * 16], m3
22393
22394; mode16 [row 1]
; Each pair register is shifted left by 2 bytes (one pair) and a new pair is
; inserted into bytes 0 and 1, then the row is computed with a new
; multiplier row in m6.  m7 still holds [pw_1024] from row 0.
22395movu m6, [r5 + 22 * 16]
; m0 already starts at [r4], so its new pair is ([r3 + 2], [r4]).
; NOTE(review): r3 is set up outside this section; presumably it addresses
; the other reference edge -- confirm its layout.
22396pslldq m0, 2
22397pinsrb m0, [r4], 1
22398pinsrb m0, [r3 + 2], 0
22399pmaddubsw m3, m0, m6
22400pmulhrsw m3, m7
; m2 / m1 / m4: the new pair is one word read one byte lower in the [r4]
; array than the previous start of that register (7, 15, 23 vs 8, 16, 24)
22401pslldq m2, 2
22402pinsrw m2, [r4 + 7], 0
22403pmaddubsw m5, m2, m6
22404pmulhrsw m5, m7
22405packuswb m3, m5
22406movu [r0 + 898 * 16], m3
22407
22408pslldq m1, 2
22409pinsrw m1, [r4 + 15], 0
22410pmaddubsw m3, m1, m6
22411pmulhrsw m3, m7
22412pslldq m4, 2
22413pinsrw m4, [r4 + 23], 0
22414pmaddubsw m5, m4, m6
22415pmulhrsw m5, m7
22416packuswb m3, m5
22417movu [r0 + 899 * 16], m3
22418
22419; mode16 [row 2]
; The pair registers m0 / m2 / m1 / m4 are reused exactly as row 1 left them
; (no shift); only the multiplier row in m6 changes.
22420movu m6, [r5 + 1 * 16]
; bytes 0 to 15
22421pmaddubsw m3, m0, m6
22422pmulhrsw m3, m7
22423pmaddubsw m5, m2, m6
22424pmulhrsw m5, m7
22425packuswb m3, m5
22426movu [r0 + 900 * 16], m3
22427
; bytes 16 to 31
22428pmaddubsw m3, m1, m6
22429pmulhrsw m3, m7
22430pmaddubsw m5, m4, m6
22431pmulhrsw m5, m7
22432packuswb m3, m5
22433movu [r0 + 901 * 16], m3
22434
22435; mode16 [row 3]
22436movu m6, [r5 + 12 * 16]
22437pslldq m0, 2
22438pinsrb m0, [r3 + 2], 1
22439pinsrb m0, [r3 + 3], 0
22440pmaddubsw m3, m0, m6
22441pmulhrsw m3, m7
22442pslldq m2, 2
22443pinsrw m2, [r4 + 6], 0
22444pmaddubsw m5, m2, m6
22445pmulhrsw m5, m7
22446packuswb m3, m5
22447movu [r0 + 902 * 16], m3
22448
22449pslldq m1, 2
22450pinsrw m1, [r4 + 14], 0
22451pmaddubsw m3, m1, m6
22452pmulhrsw m3, m7
22453pslldq m4, 2
22454pinsrw m4, [r4 + 22], 0
22455pmaddubsw m5, m4, m6
22456pmulhrsw m5, m7
22457packuswb m3, m5
22458movu [r0 + 903 * 16], m3
22459
22460; mode16 [row 4]
22461movu m6, [r5 + 23 * 16]
22462pslldq m0, 2
22463pinsrb m0, [r3 + 3], 1
22464pinsrb m0, [r3 + 5], 0
22465pmaddubsw m3, m0, m6
22466pmulhrsw m3, m7
22467pslldq m2, 2
22468pinsrw m2, [r4 + 5], 0
22469pmaddubsw m5, m2, m6
22470pmulhrsw m5, m7
22471packuswb m3, m5
22472movu [r0 + 904 * 16], m3
22473
22474pslldq m1, 2
22475pinsrw m1, [r4 + 13], 0
22476pmaddubsw m3, m1, m6
22477pmulhrsw m3, m7
22478pslldq m4, 2
22479pinsrw m4, [r4 + 21], 0
22480pmaddubsw m5, m4, m6
22481pmulhrsw m5, m7
22482packuswb m3, m5
22483movu [r0 + 905 * 16], m3
22484
22485; mode16 [row 5]
22486movu m6, [r5 + 2 * 16]
22487pmaddubsw m3, m0, m6
22488pmulhrsw m3, m7
22489pmaddubsw m5, m2, m6
22490pmulhrsw m5, m7
22491packuswb m3, m5
22492movu [r0 + 906 * 16], m3
22493
22494pmaddubsw m3, m1, m6
22495pmulhrsw m3, m7
22496pmaddubsw m5, m4, m6
22497pmulhrsw m5, m7
22498packuswb m3, m5
22499movu [r0 + 907 * 16], m3
22500
22501; mode16 [row 6]
; Shift every pair register by one pair, insert the new pair, and compute
; the row with the multiplier row at [r5 + 13 * 16].
22502movu m6, [r5 + 13 * 16]
; bytes 0 to 7: new pair is ([r3 + 6], [r3 + 5])
22503pslldq m0, 2
22504pinsrb m0, [r3 + 5], 1
22505pinsrb m0, [r3 + 6], 0
22506pmaddubsw m3, m0, m6
22507pmulhrsw m3, m7
; bytes 8 to 15: the two byte inserts place [r4 + 4] in byte 0 and [r4 + 5]
; in byte 1, i.e. the same bytes a single "pinsrw m2, [r4 + 4], 0" would
; load (the form the neighbouring rows use for m2)
22508pslldq m2, 2
22509pinsrb m2, [r4 + 5], 1
22510pinsrb m2, [r4 + 4], 0
22511pmaddubsw m5, m2, m6
22512pmulhrsw m5, m7
22513packuswb m3, m5
22514movu [r0 + 908 * 16], m3
; bytes 16 to 23
22515pslldq m1, 2
22516pinsrw m1, [r4 + 12], 0
22517pmaddubsw m3, m1, m6
22518pmulhrsw m3, m7
; bytes 24 to 31
22519pslldq m4, 2
22520pinsrw m4, [r4 + 20], 0
22521pmaddubsw m5, m4, m6
22522pmulhrsw m5, m7
22523packuswb m3, m5
22524movu [r0 + 909 * 16], m3
22525
22526; mode16 [row 7]
22527movu m6, [r5 + 24 * 16]
22528pslldq m0, 2
22529pinsrb m0, [r3 + 6], 1
22530pinsrb m0, [r3 + 8], 0
22531pmaddubsw m3, m0, m6
22532pmulhrsw m3, m7
22533pslldq m2, 2
22534pinsrw m2, [r4 + 3], 0
22535pmaddubsw m5, m2, m6
22536pmulhrsw m5, m7
22537packuswb m3, m5
22538movu [r0 + 910 * 16], m3
22539
22540pslldq m1, 2
22541pinsrw m1, [r4 + 11], 0
22542pmaddubsw m3, m1, m6
22543pmulhrsw m3, m7
22544pslldq m4, 2
22545pinsrw m4, [r4 + 19], 0
22546pmaddubsw m5, m4, m6
22547pmulhrsw m5, m7
22548packuswb m3, m5
22549movu [r0 + 911 * 16], m3
22550
22551; mode16 [row 8]
22552movu m6, [r5 + 3 * 16]
22553pmaddubsw m3, m0, m6
22554pmulhrsw m3, m7
22555pmaddubsw m5, m2, m6
22556pmulhrsw m5, m7
22557packuswb m3, m5
22558movu [r0 + 912 * 16], m3
22559
22560pmaddubsw m3, m1, m6
22561pmulhrsw m3, m7
22562pmaddubsw m5, m4, m6
22563pmulhrsw m5, m7
22564packuswb m3, m5
22565movu [r0 + 913 * 16], m3
22566
22567; mode16 [row 9]
22568movu m6, [r5 + 14 * 16]
22569pslldq m0, 2
22570pinsrb m0, [r3 + 8], 1
22571pinsrb m0, [r3 + 9], 0
22572pmaddubsw m3, m0, m6
22573pmulhrsw m3, m7
22574pslldq m2, 2
22575pinsrw m2, [r4 + 2], 0
22576pmaddubsw m5, m2, m6
22577pmulhrsw m5, m7
22578packuswb m3, m5
22579movu [r0 + 914 * 16], m3
22580
22581pslldq m1, 2
22582pinsrw m1, [r4 + 10], 0
22583pmaddubsw m3, m1, m6
22584pmulhrsw m3, m7
22585pslldq m4, 2
22586pinsrw m4, [r4 + 18], 0
22587pmaddubsw m5, m4, m6
22588pmulhrsw m5, m7
22589packuswb m3, m5
22590movu [r0 + 915 * 16], m3
22591
22592; mode16 [row 10]
22593movu m6, [r5 + 25 * 16]
22594pslldq m0, 2
22595pinsrb m0, [r3 + 9], 1
22596pinsrb m0, [r3 + 11], 0
22597pmaddubsw m3, m0, m6
22598pmulhrsw m3, m7
22599pslldq m2, 2
22600pinsrw m2, [r4 + 1], 0
22601pmaddubsw m5, m2, m6
22602pmulhrsw m5, m7
22603packuswb m3, m5
22604movu [r0 + 916 * 16], m3
22605
22606pslldq m1, 2
22607pinsrw m1, [r4 + 9], 0
22608pmaddubsw m3, m1, m6
22609pmulhrsw m3, m7
22610pslldq m4, 2
22611pinsrb m4, [r4 + 18], 1
22612pinsrb m4, [r4 + 17], 0
22613pmaddubsw m5, m4, m6
22614pmulhrsw m5, m7
22615packuswb m3, m5
22616movu [r0 + 917 * 16], m3
22617
22618; mode16 [row 11]
22619movu m6, [r5 + 4 * 16]
22620pmaddubsw m3, m0, m6
22621pmulhrsw m3, m7
22622pmaddubsw m5, m2, m6
22623pmulhrsw m5, m7
22624packuswb m3, m5
22625movu [r0 + 918 * 16], m3
22626
22627pmaddubsw m3, m1, m6
22628pmulhrsw m3, m7
22629pmaddubsw m5, m4, m6
22630pmulhrsw m5, m7
22631packuswb m3, m5
22632movu [r0 + 919 * 16], m3
22633
22634; mode16 [row 12]
22635movu m6, [r5 + 15 * 16]
22636pslldq m0, 2
22637pinsrb m0, [r3 + 11], 1
22638pinsrb m0, [r3 + 12], 0
22639pmaddubsw m3, m0, m6
22640pmulhrsw m3, m7
22641pslldq m2, 2
22642pinsrw m2, [r4 + 0], 0
22643pmaddubsw m5, m2, m6
22644pmulhrsw m5, m7
22645packuswb m3, m5
22646movu [r0 + 920 * 16], m3
22647
22648pslldq m1, 2
22649pinsrw m1, [r4 + 8], 0
22650pmaddubsw m3, m1, m6
22651pmulhrsw m3, m7
22652pslldq m4, 2
22653pinsrw m4, [r4 + 16], 0
22654pmaddubsw m5, m4, m6
22655pmulhrsw m5, m7
22656packuswb m3, m5
22657movu [r0 + 921 * 16], m3
22658
22659; mode16 [row 13]
22660movu m6, [r5 + 26 * 16]
22661pslldq m0, 2
22662pinsrb m0, [r3 + 12], 1
22663pinsrb m0, [r3 + 14], 0
22664pmaddubsw m3, m0, m6
22665pmulhrsw m3, m7
22666pslldq m2, 2
22667pinsrb m2, [r4 + 0], 1
22668pinsrb m2, [r3 + 2], 0
22669pmaddubsw m5, m2, m6
22670pmulhrsw m5, m7
22671packuswb m3, m5
22672movu [r0 + 922 * 16], m3
22673
22674pslldq m1, 2
22675pinsrw m1, [r4 + 7], 0
22676pmaddubsw m3, m1, m6
22677pmulhrsw m3, m7
22678pslldq m4, 2
22679pinsrw m4, [r4 + 15], 0
22680pmaddubsw m5, m4, m6
22681pmulhrsw m5, m7
22682packuswb m3, m5
22683movu [r0 + 923 * 16], m3
22684
22685; mode16 [row 14]
22686movu m6, [r5 + 5 * 16]
22687pmaddubsw m3, m0, m6
22688pmulhrsw m3, m7
22689pmaddubsw m5, m2, m6
22690pmulhrsw m5, m7
22691packuswb m3, m5
22692movu [r0 + 924 * 16], m3
22693
22694pmaddubsw m3, m1, m6
22695pmulhrsw m3, m7
22696pmaddubsw m5, m4, m6
22697pmulhrsw m5, m7
22698packuswb m3, m5
22699movu [r0 + 925 * 16], m3
22700
22701; mode16 [row 15]
22702movu m6, [r5 + 16 * 16]
22703pslldq m0, 2
22704pinsrb m0, [r3 + 14], 1
22705pinsrb m0, [r3 + 15], 0
22706pmaddubsw m3, m0, m6
22707pmulhrsw m3, m7
22708pslldq m2, 2
22709pinsrb m2, [r3 + 2], 1
22710pinsrb m2, [r3 + 3], 0
22711pmaddubsw m5, m2, m6
22712pmulhrsw m5, m7
22713packuswb m3, m5
22714movu [r0 + 926 * 16], m3
22715
22716pslldq m1, 2
22717pinsrw m1, [r4 + 6], 0
22718pmaddubsw m3, m1, m6
22719pmulhrsw m3, m7
22720pslldq m4, 2
22721pinsrw m4, [r4 + 14], 0
22722pmaddubsw m5, m4, m6
22723pmulhrsw m5, m7
22724packuswb m3, m5
22725movu [r0 + 927 * 16], m3
22726
22727; mode16 [row 16]
22728movu m6, [r5 + 27 * 16]
22729pslldq m0, 2
22730pinsrb m0, [r3 + 15], 1
22731pinsrb m0, [r3 + 17], 0
22732pmaddubsw m3, m0, m6
22733pmulhrsw m3, m7
22734pslldq m2, 2
22735pinsrb m2, [r3 + 3], 1
22736pinsrb m2, [r3 + 5], 0
22737pmaddubsw m5, m2, m6
22738pmulhrsw m5, m7
22739packuswb m3, m5
22740movu [r0 + 928 * 16], m3
22741
22742pslldq m1, 2
22743pinsrw m1, [r4 + 5], 0
22744pmaddubsw m3, m1, m6
22745pmulhrsw m3, m7
22746pslldq m4, 2
22747pinsrw m4, [r4 + 13], 0
22748pmaddubsw m5, m4, m6
22749pmulhrsw m5, m7
22750packuswb m3, m5
22751movu [r0 + 929 * 16], m3
22752
22753; mode16 [row 17]
22754movu m6, [r5 + 6 * 16]
22755pmaddubsw m3, m0, m6
22756pmulhrsw m3, m7
22757pmaddubsw m5, m2, m6
22758pmulhrsw m5, m7
22759packuswb m3, m5
22760movu [r0 + 930 * 16], m3
22761
22762pmaddubsw m3, m1, m6
22763pmulhrsw m3, m7
22764pmaddubsw m5, m4, m6
22765pmulhrsw m5, m7
22766packuswb m3, m5
22767movu [r0 + 931 * 16], m3
22768
22769; mode16 [row 18]
22770movu m6, [r5 + 17 * 16]
22771pslldq m0, 2
22772pinsrb m0, [r3 + 17], 1
22773pinsrb m0, [r3 + 18], 0
22774pmaddubsw m3, m0, m6
22775pmulhrsw m3, m7
22776pslldq m2, 2
22777pinsrb m2, [r3 + 5], 1
22778pinsrb m2, [r3 + 6], 0
22779pmaddubsw m5, m2, m6
22780pmulhrsw m5, m7
22781packuswb m3, m5
22782movu [r0 + 932 * 16], m3
22783
22784pslldq m1, 2
22785pinsrw m1, [r4 + 4], 0
22786pmaddubsw m3, m1, m6
22787pmulhrsw m3, m7
22788pslldq m4, 2
22789pinsrw m4, [r4 + 12], 0
22790pmaddubsw m5, m4, m6
22791pmulhrsw m5, m7
22792packuswb m3, m5
22793movu [r0 + 933 * 16], m3
22794
22795; mode16 [row 19]
22796movu m6, [r5 + 28 * 16]
22797pslldq m0, 2
22798pinsrb m0, [r3 + 18], 1
22799pinsrb m0, [r3 + 20], 0
22800pmaddubsw m3, m0, m6
22801pmulhrsw m3, m7
22802pslldq m2, 2
22803pinsrb m2, [r3 + 6], 1
22804pinsrb m2, [r3 + 8], 0
22805pmaddubsw m5, m2, m6
22806pmulhrsw m5, m7
22807packuswb m3, m5
22808movu [r0 + 934 * 16], m3
22809
22810pslldq m1, 2
22811pinsrw m1, [r4 + 3], 0
22812pmaddubsw m3, m1, m6
22813pmulhrsw m3, m7
22814pslldq m4, 2
22815pinsrw m4, [r4 + 11], 0
22816pmaddubsw m5, m4, m6
22817pmulhrsw m5, m7
22818packuswb m3, m5
22819movu [r0 + 935 * 16], m3
22820
22821; mode16 [row 20]
22822movu m6, [r5 + 7 * 16]
22823pmaddubsw m3, m0, m6
22824pmulhrsw m3, m7
22825pmaddubsw m5, m2, m6
22826pmulhrsw m5, m7
22827packuswb m3, m5
22828movu [r0 + 936 * 16], m3
22829
22830pmaddubsw m3, m1, m6
22831pmulhrsw m3, m7
22832pmaddubsw m5, m4, m6
22833pmulhrsw m5, m7
22834packuswb m3, m5
22835movu [r0 + 937 * 16], m3
22836
22837; mode16 [row 21]
22838movu m6, [r5 + 18 * 16]
22839pslldq m0, 2
22840pinsrb m0, [r3 + 20], 1
22841pinsrb m0, [r3 + 21], 0
22842pmaddubsw m3, m0, m6
22843pmulhrsw m3, m7
22844pslldq m2, 2
22845pinsrb m2, [r3 + 8], 1
22846pinsrb m2, [r3 + 9], 0
22847pmaddubsw m5, m2, m6
22848pmulhrsw m5, m7
22849packuswb m3, m5
22850movu [r0 + 938 * 16], m3
22851
22852pslldq m1, 2
22853pinsrw m1, [r4 + 2], 0
22854pmaddubsw m3, m1, m6
22855pmulhrsw m3, m7
22856pslldq m4, 2
22857pinsrw m4, [r4 + 10], 0
22858pmaddubsw m5, m4, m6
22859pmulhrsw m5, m7
22860packuswb m3, m5
22861movu [r0 + 939 * 16], m3
22862
22863; mode16 [row 22]
22864movu m6, [r5 + 29 * 16]
22865pslldq m0, 2
22866pinsrb m0, [r3 + 21], 1
22867pinsrb m0, [r3 + 23], 0
22868pmaddubsw m3, m0, m6
22869pmulhrsw m3, m7
22870pslldq m2, 2
22871pinsrb m2, [r3 + 9], 1
22872pinsrb m2, [r3 + 11], 0
22873pmaddubsw m5, m2, m6
22874pmulhrsw m5, m7
22875packuswb m3, m5
22876movu [r0 + 940 * 16], m3
22877
22878pslldq m1, 2
22879pinsrw m1, [r4 + 1], 0
22880pmaddubsw m3, m1, m6
22881pmulhrsw m3, m7
22882pslldq m4, 2
22883pinsrw m4, [r4 + 9], 0
22884pmaddubsw m5, m4, m6
22885pmulhrsw m5, m7
22886packuswb m3, m5
22887movu [r0 + 941 * 16], m3
22888
22889; mode16 [row 23]
22890movu m6, [r5 + 8 * 16]
22891pmaddubsw m3, m0, m6
22892pmulhrsw m3, m7
22893pmaddubsw m5, m2, m6
22894pmulhrsw m5, m7
22895packuswb m3, m5
22896movu [r0 + 942 * 16], m3
22897
22898pmaddubsw m3, m1, m6
22899pmulhrsw m3, m7
22900pmaddubsw m5, m4, m6
22901pmulhrsw m5, m7
22902packuswb m3, m5
22903movu [r0 + 943 * 16], m3
22904
22905; mode16 [row 24]
22906movu m6, [r5 + 19 * 16]
22907pslldq m0, 2
22908pinsrb m0, [r3 + 23], 1
22909pinsrb m0, [r3 + 24], 0
22910pmaddubsw m3, m0, m6
22911pmulhrsw m3, m7
22912pslldq m2, 2
22913pinsrb m2, [r3 + 11], 1
22914pinsrb m2, [r3 + 12], 0
22915pmaddubsw m5, m2, m6
22916pmulhrsw m5, m7
22917packuswb m3, m5
22918movu [r0 + 944 * 16], m3
22919
22920pslldq m1, 2
22921pinsrw m1, [r4 + 0], 0
22922pmaddubsw m3, m1, m6
22923pmulhrsw m3, m7
22924pslldq m4, 2
22925pinsrw m4, [r4 + 8], 0
22926pmaddubsw m5, m4, m6
22927pmulhrsw m5, m7
22928packuswb m3, m5
22929movu [r0 + 945 * 16], m3
22930
22931; mode16 [row 25]
22932movu m6, [r5 + 30 * 16]
22933pslldq m0, 2
22934pinsrb m0, [r3 + 24], 1
22935pinsrb m0, [r3 + 26], 0
22936pmaddubsw m3, m0, m6
22937pmulhrsw m3, m7
22938pslldq m2, 2
22939pinsrb m2, [r3 + 12], 1
22940pinsrb m2, [r3 + 14], 0
22941pmaddubsw m5, m2, m6
22942pmulhrsw m5, m7
22943packuswb m3, m5
22944movu [r0 + 946 * 16], m3
22945
22946pslldq m1, 2
22947pinsrb m1, [r4 + 0], 1
22948pinsrb m1, [r3 + 2], 0
22949pmaddubsw m3, m1, m6
22950pmulhrsw m3, m7
22951pslldq m4, 2
22952pinsrw m4, [r4 + 7], 0
22953pmaddubsw m5, m4, m6
22954pmulhrsw m5, m7
22955packuswb m3, m5
22956movu [r0 + 947 * 16], m3
22957
22958; mode16 [row 26]
22959movu m6, [r5 + 9 * 16]
22960pmaddubsw m3, m0, m6
22961pmulhrsw m3, m7
22962pmaddubsw m5, m2, m6
22963pmulhrsw m5, m7
22964packuswb m3, m5
22965movu [r0 + 948 * 16], m3
22966
22967pmaddubsw m3, m1, m6
22968pmulhrsw m3, m7
22969pmaddubsw m5, m4, m6
22970pmulhrsw m5, m7
22971packuswb m3, m5
22972movu [r0 + 949 * 16], m3
22973
22974; mode16 [row 27]
22975movu m6, [r5 + 20 * 16]
22976pslldq m0, 2
22977pinsrb m0, [r3 + 26], 1
22978pinsrb m0, [r3 + 27], 0
22979pmaddubsw m3, m0, m6
22980pmulhrsw m3, m7
22981pslldq m2, 2
22982pinsrb m2, [r3 + 14], 1
22983pinsrb m2, [r3 + 15], 0
22984pmaddubsw m5, m2, m6
22985pmulhrsw m5, m7
22986packuswb m3, m5
22987movu [r0 + 950 * 16], m3
22988
22989pslldq m1, 2
22990pinsrb m1, [r3 + 2], 1
22991pinsrb m1, [r3 + 3], 0
22992pmaddubsw m3, m1, m6
22993pmulhrsw m3, m7
22994pslldq m4, 2
22995pinsrw m4, [r4 + 6], 0
22996pmaddubsw m5, m4, m6
22997pmulhrsw m5, m7
22998packuswb m3, m5
22999movu [r0 + 951 * 16], m3
23000
23001; mode16 [row 28]
23002movu m6, [r5 + 31 * 16]
23003pslldq m0, 2
23004pinsrb m0, [r3 + 27], 1
23005pinsrb m0, [r3 + 29], 0
23006pmaddubsw m3, m0, m6
23007pmulhrsw m3, m7
23008pslldq m2, 2
23009pinsrb m2, [r3 + 15], 1
23010pinsrb m2, [r3 + 17], 0
23011pmaddubsw m5, m2, m6
23012pmulhrsw m5, m7
23013packuswb m3, m5
23014movu [r0 + 952 * 16], m3
23015
23016pslldq m1, 2
23017pinsrb m1, [r3 + 3], 1
23018pinsrb m1, [r3 + 5], 0
23019pmaddubsw m3, m1, m6
23020pmulhrsw m3, m7
23021pslldq m4, 2
23022pinsrw m4, [r4 + 5], 0
23023pmaddubsw m5, m4, m6
23024pmulhrsw m5, m7
23025packuswb m3, m5
23026movu [r0 + 953 * 16], m3
23027
23028; mode16 [row 29]
23029movu m6, [r5 + 10 * 16]
23030pmaddubsw m3, m0, m6
23031pmulhrsw m3, m7
23032pmaddubsw m5, m2, m6
23033pmulhrsw m5, m7
23034packuswb m3, m5
23035movu [r0 + 954 * 16], m3
23036
23037pmaddubsw m3, m1, m6
23038pmulhrsw m3, m7
23039pmaddubsw m5, m4, m6
23040pmulhrsw m5, m7
23041packuswb m3, m5
23042movu [r0 + 955 * 16], m3
23043
23044; mode16 [row 30]
23045movu m6, [r5 + 21 * 16]
23046pslldq m0, 2
23047pinsrb m0, [r3 + 29], 1
23048pinsrb m0, [r3 + 30], 0
23049pmaddubsw m3, m0, m6
23050pmulhrsw m3, m7
23051pslldq m2, 2
23052pinsrb m2, [r3 + 17], 1
23053pinsrb m2, [r3 + 18], 0
23054pmaddubsw m5, m2, m6
23055pmulhrsw m5, m7
23056packuswb m3, m5
23057movu [r0 + 956 * 16], m3
23058
23059pslldq m1, 2
23060pinsrb m1, [r3 + 5], 1
23061pinsrb m1, [r3 + 6], 0
23062pmaddubsw m3, m1, m6
23063pmulhrsw m3, m7
23064pslldq m4, 2
23065pinsrw m4, [r4 + 4], 0
23066pmaddubsw m5, m4, m6
23067pmulhrsw m5, m7
23068packuswb m3, m5
23069movu [r0 + 957 * 16], m3
23070
23071; mode16 [row 31]
; No multiply/round/pack for this row: each group of 8 output bytes is
; selected from its pair register (m0, m2, m1, m4 in output order) by a
; byte shuffle and stored as 8 bytes.
; NOTE(review): tab_S2 is defined outside this section; presumably it picks
; one byte out of every pair because the fraction for this row is 0
; ((-21 * 32) & 31) -- confirm against the table definition.
23072pshufb m5, m0, [tab_S2]
23073movh [r0 + 958 * 16], m5
23074pshufb m5, m2, [tab_S2]
23075movh [r0 + 958 * 16 + 8], m5
23076pshufb m5, m1, [tab_S2]
23077movh [r0 + 959 * 16], m5
23078pshufb m5, m4, [tab_S2]
23079movh [r0 + 959 * 16 + 8], m5
23080
23081; mode 17 [row 0]
; Rebuilds the working registers from scratch for mode 17:
;   m6 = multiplier row for the current output row (reloaded per row)
;   m7 = [pw_1024], the rounding scale used by every pmulhrsw below
;   m0 / m2 / m1 / m4 = interleaved byte pairs ([r4 + i], [r4 + i + 1]) for
;                       output bytes 0-7 / 8-15 / 16-23 / 24-31
; NOTE(review): row index 6 matches (-26 * 1) & 31, the HEVC fraction for
; angular mode 17 -- assumes r5 points at a per-fraction multiplier table;
; confirm where r5 is set.
23082movu m6, [r5 + 6 * 16]
23083movu m7, [pw_1024]
; bytes 0 to 7: load 8 bytes at [r4] and at [r4 + 1], interleave into pairs
23084movh m0, [r4 ]
23085movh m1, [r4 + 1 ]
23086punpcklbw m0, m1
; multiply unsigned pairs by signed multipliers and add each pair, then round
23087pmaddubsw m1, m0, m6
23088pmulhrsw m1, m7
; bytes 8 to 15
23089movh m2, [r4 + 8]
23090movh m3, [r4 + 9]
23091punpcklbw m2, m3
23092pmaddubsw m3, m2, m6
23093pmulhrsw m3, m7
; saturate both word vectors to bytes and store the first 16-byte slot
23094packuswb m1, m3
23095movu [r0 + 960 * 16], m1
23096
; bytes 16 to 23 (m1 is reused as a pair register from here on)
23097movh m1, [r4 + 16]
23098movh m3, [r4 + 17]
23099punpcklbw m1, m3
23100pmaddubsw m3, m1, m6
23101pmulhrsw m3, m7
; bytes 24 to 31
23102movh m4, [r4 + 24]
23103movh m5, [r4 + 25]
23104punpcklbw m4, m5
23105pmaddubsw m5, m4, m6
23106pmulhrsw m5, m7
23107packuswb m3, m5
23108movu [r0 + 961 * 16], m3
23109
23110; mode17 [row 1]
; Each pair register is shifted left by 2 bytes (one pair) and a new pair is
; inserted into bytes 0 and 1, then the row is computed with a new
; multiplier row in m6.  m7 still holds [pw_1024] from row 0.
23111movu m6, [r5 + 12 * 16]
; m0: new pair is ([r3 + 1], [r3 + 0]).
; NOTE(review): the matching insert in mode 16 [row 1] takes byte 1 from
; [r4], and mode 17 [row 11] uses [r4 + 0] for the same role in m2;
; presumably [r3 + 0] and [r4] hold the same sample -- confirm.
23112pslldq m0, 2
23113pinsrb m0, [r3 + 0], 1
23114pinsrb m0, [r3 + 1], 0
23115pmaddubsw m3, m0, m6
23116pmulhrsw m3, m7
; m2 / m1 / m4: the new pair is one word read one byte lower in the [r4]
; array than the previous start of that register (7, 15, 23 vs 8, 16, 24)
23117pslldq m2, 2
23118pinsrw m2, [r4 + 7], 0
23119pmaddubsw m5, m2, m6
23120pmulhrsw m5, m7
23121packuswb m3, m5
23122movu [r0 + 962 * 16], m3
23123
23124pslldq m1, 2
23125pinsrw m1, [r4 + 15], 0
23126pmaddubsw m3, m1, m6
23127pmulhrsw m3, m7
23128pslldq m4, 2
23129pinsrw m4, [r4 + 23], 0
23130pmaddubsw m5, m4, m6
23131pmulhrsw m5, m7
23132packuswb m3, m5
23133movu [r0 + 963 * 16], m3
23134
23135; mode17 [row 2]
23136movu m6, [r5 + 18 * 16]
23137pslldq m0, 2
23138pinsrb m0, [r3 + 1], 1
23139pinsrb m0, [r3 + 2], 0
23140pmaddubsw m3, m0, m6
23141pmulhrsw m3, m7
23142pslldq m2, 2
23143pinsrw m2, [r4 + 6], 0
23144pmaddubsw m5, m2, m6
23145pmulhrsw m5, m7
23146packuswb m3, m5
23147movu [r0 + 964 * 16], m3
23148
23149pslldq m1, 2
23150pinsrw m1, [r4 + 14], 0
23151pmaddubsw m3, m1, m6
23152pmulhrsw m3, m7
23153pslldq m4, 2
23154pinsrw m4, [r4 + 22], 0
23155pmaddubsw m5, m4, m6
23156pmulhrsw m5, m7
23157packuswb m3, m5
23158movu [r0 + 965 * 16], m3
23159
23160; mode17 [row 3]
23161movu m6, [r5 + 24 * 16]
23162pslldq m0, 2
23163pinsrb m0, [r3 + 2], 1
23164pinsrb m0, [r3 + 4], 0
23165pmaddubsw m3, m0, m6
23166pmulhrsw m3, m7
23167pslldq m2, 2
23168pinsrw m2, [r4 + 5], 0
23169pmaddubsw m5, m2, m6
23170pmulhrsw m5, m7
23171packuswb m3, m5
23172movu [r0 + 966 * 16], m3
23173
23174pslldq m1, 2
23175pinsrw m1, [r4 + 13], 0
23176pmaddubsw m3, m1, m6
23177pmulhrsw m3, m7
23178pslldq m4, 2
23179pinsrw m4, [r4 + 21], 0
23180pmaddubsw m5, m4, m6
23181pmulhrsw m5, m7
23182packuswb m3, m5
23183movu [r0 + 967 * 16], m3
23184
23185; mode17 [row 4]
23186movu m6, [r5 + 30 * 16]
23187pslldq m0, 2
23188pinsrb m0, [r3 + 4], 1
23189pinsrb m0, [r3 + 5], 0
23190pmaddubsw m3, m0, m6
23191pmulhrsw m3, m7
23192pslldq m2, 2
23193pinsrw m2, [r4 + 4], 0
23194pmaddubsw m5, m2, m6
23195pmulhrsw m5, m7
23196packuswb m3, m5
23197movu [r0 + 968 * 16], m3
23198
23199pslldq m1, 2
23200pinsrw m1, [r4 + 12], 0
23201pmaddubsw m3, m1, m6
23202pmulhrsw m3, m7
23203pslldq m4, 2
23204pinsrw m4, [r4 + 20], 0
23205pmaddubsw m5, m4, m6
23206pmulhrsw m5, m7
23207packuswb m3, m5
23208movu [r0 + 969 * 16], m3
23209
23210; mode17 [row 5]
23211movu m6, [r5 + 4 * 16]
23212pmaddubsw m3, m0, m6
23213pmulhrsw m3, m7
23214pmaddubsw m5, m2, m6
23215pmulhrsw m5, m7
23216packuswb m3, m5
23217movu [r0 + 970 * 16], m3
23218
23219pmaddubsw m3, m1, m6
23220pmulhrsw m3, m7
23221pmaddubsw m5, m4, m6
23222pmulhrsw m5, m7
23223packuswb m3, m5
23224movu [r0 + 971 * 16], m3
23225
23226; mode17 [row 6]
23227movu m6, [r5 + 10 * 16]
23228pslldq m0, 2
23229pinsrb m0, [r3 + 5], 1
23230pinsrb m0, [r3 + 6], 0
23231pmaddubsw m3, m0, m6
23232pmulhrsw m3, m7
23233pslldq m2, 2
23234pinsrw m2, [r4 + 3], 0
23235pmaddubsw m5, m2, m6
23236pmulhrsw m5, m7
23237packuswb m3, m5
23238movu [r0 + 972 * 16], m3
23239
23240pslldq m1, 2
23241pinsrw m1, [r4 + 11], 0
23242pmaddubsw m3, m1, m6
23243pmulhrsw m3, m7
23244pslldq m4, 2
23245pinsrw m4, [r4 + 19], 0
23246pmaddubsw m5, m4, m6
23247pmulhrsw m5, m7
23248packuswb m3, m5
23249movu [r0 + 973 * 16], m3
23250
23251; mode17 [row 7]
23252movu m6, [r5 + 16 * 16]
23253pslldq m0, 2
23254pinsrb m0, [r3 + 6], 1
23255pinsrb m0, [r3 + 7], 0
23256pmaddubsw m3, m0, m6
23257pmulhrsw m3, m7
23258pslldq m2, 2
23259pinsrw m2, [r4 + 2], 0
23260pmaddubsw m5, m2, m6
23261pmulhrsw m5, m7
23262packuswb m3, m5
23263movu [r0 + 974 * 16], m3
23264
23265pslldq m1, 2
23266pinsrw m1, [r4 + 10], 0
23267pmaddubsw m3, m1, m6
23268pmulhrsw m3, m7
23269pslldq m4, 2
23270pinsrw m4, [r4 + 18], 0
23271pmaddubsw m5, m4, m6
23272pmulhrsw m5, m7
23273packuswb m3, m5
23274movu [r0 + 975 * 16], m3
23275
23276; mode17 [row 8]
23277movu m6, [r5 + 22 * 16]
23278pslldq m0, 2
23279pinsrb m0, [r3 + 7], 1
23280pinsrb m0, [r3 + 9], 0
23281pmaddubsw m3, m0, m6
23282pmulhrsw m3, m7
23283pslldq m2, 2
23284pinsrw m2, [r4 + 1], 0
23285pmaddubsw m5, m2, m6
23286pmulhrsw m5, m7
23287packuswb m3, m5
23288movu [r0 + 976 * 16], m3
23289
23290pslldq m1, 2
23291pinsrw m1, [r4 + 9], 0
23292pmaddubsw m3, m1, m6
23293pmulhrsw m3, m7
23294pslldq m4, 2
23295pinsrw m4, [r4 + 17], 0
23296pmaddubsw m5, m4, m6
23297pmulhrsw m5, m7
23298packuswb m3, m5
23299movu [r0 + 977 * 16], m3
23300
23301; mode17 [row 9]
23302movu m6, [r5 + 28 * 16]
23303pslldq m0, 2
23304pinsrb m0, [r3 + 9], 1
23305pinsrb m0, [r3 + 10], 0
23306pmaddubsw m3, m0, m6
23307pmulhrsw m3, m7
23308pslldq m2, 2
23309pinsrw m2, [r4 + 0], 0
23310pmaddubsw m5, m2, m6
23311pmulhrsw m5, m7
23312packuswb m3, m5
23313movu [r0 + 978 * 16], m3
23314
23315pslldq m1, 2
23316pinsrw m1, [r4 + 8], 0
23317pmaddubsw m3, m1, m6
23318pmulhrsw m3, m7
23319pslldq m4, 2
23320pinsrw m4, [r4 + 16], 0
23321pmaddubsw m5, m4, m6
23322pmulhrsw m5, m7
23323packuswb m3, m5
23324movu [r0 + 979 * 16], m3
23325
23326; mode17 [row 10]
23327movu m6, [r5 + 2 * 16]
23328pmaddubsw m3, m0, m6
23329pmulhrsw m3, m7
23330pmaddubsw m5, m2, m6
23331pmulhrsw m5, m7
23332packuswb m3, m5
23333movu [r0 + 980 * 16], m3
23334
23335pmaddubsw m3, m1, m6
23336pmulhrsw m3, m7
23337pmaddubsw m5, m4, m6
23338pmulhrsw m5, m7
23339packuswb m3, m5
23340movu [r0 + 981 * 16], m3
23341
23342; mode17 [row 11]
23343movu m6, [r5 + 8 * 16]
23344pslldq m0, 2
23345pinsrb m0, [r3 + 10], 1
23346pinsrb m0, [r3 + 11], 0
23347pmaddubsw m3, m0, m6
23348pmulhrsw m3, m7
23349pslldq m2, 2
23350pinsrb m2, [r4 + 0], 1
23351pinsrb m2, [r3 + 1], 0
23352pmaddubsw m5, m2, m6
23353pmulhrsw m5, m7
23354packuswb m3, m5
23355movu [r0 + 982 * 16], m3
23356
23357pslldq m1, 2
23358pinsrw m1, [r4 + 7], 0
23359pmaddubsw m3, m1, m6
23360pmulhrsw m3, m7
23361pslldq m4, 2
23362pinsrw m4, [r4 + 15], 0
23363pmaddubsw m5, m4, m6
23364pmulhrsw m5, m7
23365packuswb m3, m5
23366movu [r0 + 983 * 16], m3
23367
23368; mode17 [row 12]
23369movu m6, [r5 + 14 * 16]
23370pslldq m0, 2
23371pinsrb m0, [r3 + 11], 1
23372pinsrb m0, [r3 + 12], 0
23373pmaddubsw m3, m0, m6
23374pmulhrsw m3, m7
23375pslldq m2, 2
23376pinsrb m2, [r3 + 1], 1
23377pinsrb m2, [r3 + 2], 0
23378pmaddubsw m5, m2, m6
23379pmulhrsw m5, m7
23380packuswb m3, m5
23381movu [r0 + 984 * 16], m3
23382
23383pslldq m1, 2
23384pinsrw m1, [r4 + 6], 0
23385pmaddubsw m3, m1, m6
23386pmulhrsw m3, m7
23387pslldq m4, 2
23388pinsrw m4, [r4 + 14], 0
23389pmaddubsw m5, m4, m6
23390pmulhrsw m5, m7
23391packuswb m3, m5
23392movu [r0 + 985 * 16], m3
23393
23394; mode17 [row 13]
23395movu m6, [r5 + 20 * 16]
23396pslldq m0, 2
23397pinsrb m0, [r3 + 12], 1
23398pinsrb m0, [r3 + 14], 0
23399pmaddubsw m3, m0, m6
23400pmulhrsw m3, m7
23401pslldq m2, 2
23402pinsrb m2, [r3 + 2], 1
23403pinsrb m2, [r3 + 4], 0
23404pmaddubsw m5, m2, m6
23405pmulhrsw m5, m7
23406packuswb m3, m5
23407movu [r0 + 986 * 16], m3
23408
23409pslldq m1, 2
23410pinsrw m1, [r4 + 5], 0
23411pmaddubsw m3, m1, m6
23412pmulhrsw m3, m7
23413pslldq m4, 2
23414pinsrw m4, [r4 + 13], 0
23415pmaddubsw m5, m4, m6
23416pmulhrsw m5, m7
23417packuswb m3, m5
23418movu [r0 + 987 * 16], m3
23419
23420; mode17 [row 14]
23421movu m6, [r5 + 26 * 16]
23422pslldq m0, 2
23423pinsrb m0, [r3 + 14], 1
23424pinsrb m0, [r3 + 15], 0
23425pmaddubsw m3, m0, m6
23426pmulhrsw m3, m7
23427pslldq m2, 2
23428pinsrb m2, [r3 + 4], 1
23429pinsrb m2, [r3 + 5], 0
23430pmaddubsw m5, m2, m6
23431pmulhrsw m5, m7
23432packuswb m3, m5
23433movu [r0 + 988 * 16], m3
23434
23435pslldq m1, 2
23436pinsrw m1, [r4 + 4], 0
23437pmaddubsw m3, m1, m6
23438pmulhrsw m3, m7
23439pslldq m4, 2
23440pinsrw m4, [r4 + 12], 0
23441pmaddubsw m5, m4, m6
23442pmulhrsw m5, m7
23443packuswb m3, m5
23444movu [r0 + 989 * 16], m3
23445
23446; mode17 [row 15]
23447pshufb m5, m0, [tab_S2]
23448movh [r0 + 990 * 16], m5
23449pshufb m5, m2, [tab_S2]
23450movh [r0 + 990 * 16 + 8], m5
23451pshufb m5, m1, [tab_S2]
23452movh [r0 + 991 * 16], m5
23453pshufb m5, m4, [tab_S2]
23454movh [r0 + 991 * 16 + 8], m5
23455
23456; mode17 [row 16]
23457movu m6, [r5 + 6 * 16]
23458pslldq m0, 2
23459pinsrb m0, [r3 + 15], 1
23460pinsrb m0, [r3 + 16], 0
23461pmaddubsw m3, m0, m6
23462pmulhrsw m3, m7
23463pslldq m2, 2
23464pinsrb m2, [r3 + 5], 1
23465pinsrb m2, [r3 + 6], 0
23466pmaddubsw m5, m2, m6
23467pmulhrsw m5, m7
23468packuswb m3, m5
23469movu [r0 + 992 * 16], m3
23470
23471pslldq m1, 2
23472pinsrw m1, [r4 + 3], 0
23473pmaddubsw m3, m1, m6
23474pmulhrsw m3, m7
23475pslldq m4, 2
23476pinsrw m4, [r4 + 11], 0
23477pmaddubsw m5, m4, m6
23478pmulhrsw m5, m7
23479packuswb m3, m5
23480movu [r0 + 993 * 16], m3
23481
23482; mode17 [row 17]
23483movu m6, [r5 + 12 * 16]
23484pslldq m0, 2
23485pinsrb m0, [r3 + 16], 1
23486pinsrb m0, [r3 + 17], 0
23487pmaddubsw m3, m0, m6
23488pmulhrsw m3, m7
23489pslldq m2, 2
23490pinsrb m2, [r3 + 6], 1
23491pinsrb m2, [r3 + 7], 0
23492pmaddubsw m5, m2, m6
23493pmulhrsw m5, m7
23494packuswb m3, m5
23495movu [r0 + 994 * 16], m3
23496
23497pslldq m1, 2
23498pinsrw m1, [r4 + 2], 0
23499pmaddubsw m3, m1, m6
23500pmulhrsw m3, m7
23501pslldq m4, 2
23502pinsrw m4, [r4 + 10], 0
23503pmaddubsw m5, m4, m6
23504pmulhrsw m5, m7
23505packuswb m3, m5
23506movu [r0 + 995 * 16], m3
23507
23508; mode17 [row 18]
23509movu m6, [r5 + 18 * 16]
23510pslldq m0, 2
23511pinsrb m0, [r3 + 17], 1
23512pinsrb m0, [r3 + 18], 0
23513pmaddubsw m3, m0, m6
23514pmulhrsw m3, m7
23515pslldq m2, 2
23516pinsrb m2, [r3 + 7], 1
23517pinsrb m2, [r3 + 9], 0
23518pmaddubsw m5, m2, m6
23519pmulhrsw m5, m7
23520packuswb m3, m5
23521movu [r0 + 996 * 16], m3
23522
23523pslldq m1, 2
23524pinsrw m1, [r4 + 1], 0
23525pmaddubsw m3, m1, m6
23526pmulhrsw m3, m7
23527pslldq m4, 2
23528pinsrw m4, [r4 + 9], 0
23529pmaddubsw m5, m4, m6
23530pmulhrsw m5, m7
23531packuswb m3, m5
23532movu [r0 + 997 * 16], m3
23533
23534; mode17 [row 19]
23535movu m6, [r5 + 24 * 16]
23536pslldq m0, 2
23537pinsrb m0, [r3 + 18], 1
23538pinsrb m0, [r3 + 20], 0
23539pmaddubsw m3, m0, m6
23540pmulhrsw m3, m7
23541pslldq m2, 2
23542pinsrb m2, [r3 + 9], 1
23543pinsrb m2, [r3 + 10], 0
23544pmaddubsw m5, m2, m6
23545pmulhrsw m5, m7
23546packuswb m3, m5
23547movu [r0 + 998 * 16], m3
23548
23549pslldq m1, 2
23550pinsrw m1, [r4 + 0], 0
23551pmaddubsw m3, m1, m6
23552pmulhrsw m3, m7
23553pslldq m4, 2
23554pinsrw m4, [r4 + 8], 0
23555pmaddubsw m5, m4, m6
23556pmulhrsw m5, m7
23557packuswb m3, m5
23558movu [r0 + 999 * 16], m3
23559
23560; mode17 [row 20]
23561movu m6, [r5 + 30 * 16]
23562pslldq m0, 2
23563pinsrb m0, [r3 + 20], 1
23564pinsrb m0, [r3 + 21], 0
23565pmaddubsw m3, m0, m6
23566pmulhrsw m3, m7
23567pslldq m2, 2
23568pinsrb m2, [r3 + 10], 1
23569pinsrb m2, [r3 + 11], 0
23570pmaddubsw m5, m2, m6
23571pmulhrsw m5, m7
23572packuswb m3, m5
23573movu [r0 + 1000 * 16], m3
23574
23575pslldq m1, 2
23576pinsrb m1, [r4 + 0], 1
23577pinsrb m1, [r3 + 1], 0
23578pmaddubsw m3, m1, m6
23579pmulhrsw m3, m7
23580pslldq m4, 2
23581;pinsrb m4, [r4 + 8], 1
23582;pinsrb m4, [r4 + 7], 0
23583pinsrw m4, [r4 + 7], 0
23584pmaddubsw m5, m4, m6
23585pmulhrsw m5, m7
23586packuswb m3, m5
23587movu [r0 + 1001 * 16], m3
23588
23589; mode17 [row 21]
23590movu m6, [r5 + 4 * 16]
23591pmaddubsw m3, m0, m6
23592pmulhrsw m3, m7
23593pmaddubsw m5, m2, m6
23594pmulhrsw m5, m7
23595packuswb m3, m5
23596movu [r0 + 1002 * 16], m3
23597
23598pmaddubsw m3, m1, m6
23599pmulhrsw m3, m7
23600pmaddubsw m5, m4, m6
23601pmulhrsw m5, m7
23602packuswb m3, m5
23603movu [r0 + 1003 * 16], m3
23604
23605; mode17 [row 22]
23606movu m6, [r5 + 10 * 16]
23607pslldq m0, 2
23608pinsrb m0, [r3 + 21], 1
23609pinsrb m0, [r3 + 22], 0
23610pmaddubsw m3, m0, m6
23611pmulhrsw m3, m7
23612pslldq m2, 2
23613pinsrb m2, [r3 + 11], 1
23614pinsrb m2, [r3 + 12], 0
23615pmaddubsw m5, m2, m6
23616pmulhrsw m5, m7
23617packuswb m3, m5
23618movu [r0 + 1004 * 16], m3
23619
23620pslldq m1, 2
23621pinsrb m1, [r3 + 1], 1
23622pinsrb m1, [r3 + 2], 0
23623pmaddubsw m3, m1, m6
23624pmulhrsw m3, m7
23625pslldq m4, 2
23626pinsrw m4, [r4 + 6], 0
23627pmaddubsw m5, m4, m6
23628pmulhrsw m5, m7
23629packuswb m3, m5
23630movu [r0 + 1005 * 16], m3
23631
23632; mode17 [row 23]
23633movu m6, [r5 + 16 * 16]
23634pslldq m0, 2
23635pinsrb m0, [r3 + 22], 1
23636pinsrb m0, [r3 + 23], 0
23637pmaddubsw m3, m0, m6
23638pmulhrsw m3, m7
23639pslldq m2, 2
23640pinsrb m2, [r3 + 12], 1
23641pinsrb m2, [r3 + 14], 0
23642pmaddubsw m5, m2, m6
23643pmulhrsw m5, m7
23644packuswb m3, m5
23645movu [r0 + 1006 * 16], m3
23646
23647pslldq m1, 2
23648pinsrb m1, [r3 + 2], 1
23649pinsrb m1, [r3 + 4], 0
23650pmaddubsw m3, m1, m6
23651pmulhrsw m3, m7
23652pslldq m4, 2
23653pinsrw m4, [r4 + 5], 0
23654pmaddubsw m5, m4, m6
23655pmulhrsw m5, m7
23656packuswb m3, m5
23657movu [r0 + 1007 * 16], m3
23658
23659; mode17 [row 24]
23660movu m6, [r5 + 22 * 16]
23661pslldq m0, 2
23662pinsrb m0, [r3 + 23], 1
23663pinsrb m0, [r3 + 25], 0
23664pmaddubsw m3, m0, m6
23665pmulhrsw m3, m7
23666pslldq m2, 2
23667pinsrb m2, [r3 + 14], 1
23668pinsrb m2, [r3 + 15], 0
23669pmaddubsw m5, m2, m6
23670pmulhrsw m5, m7
23671packuswb m3, m5
23672movu [r0 + 1008 * 16], m3
23673
23674pslldq m1, 2
23675pinsrb m1, [r3 + 4], 1
23676pinsrb m1, [r3 + 5], 0
23677pmaddubsw m3, m1, m6
23678pmulhrsw m3, m7
23679pslldq m4, 2
23680pinsrw m4, [r4 + 4], 0
23681pmaddubsw m5, m4, m6
23682pmulhrsw m5, m7
23683packuswb m3, m5
23684movu [r0 + 1009 * 16], m3
23685
23686; mode17 [row 25]
23687movu m6, [r5 + 28 * 16]
23688pslldq m0, 2
23689pinsrb m0, [r3 + 25], 1
23690pinsrb m0, [r3 + 26], 0
23691pmaddubsw m3, m0, m6
23692pmulhrsw m3, m7
23693pslldq m2, 2
23694pinsrb m2, [r3 + 15], 1
23695pinsrb m2, [r3 + 16], 0
23696pmaddubsw m5, m2, m6
23697pmulhrsw m5, m7
23698packuswb m3, m5
23699movu [r0 + 1010 * 16], m3
23700
23701pslldq m1, 2
23702pinsrb m1, [r3 + 5], 1
23703pinsrb m1, [r3 + 6], 0
23704pmaddubsw m3, m1, m6
23705pmulhrsw m3, m7
23706pslldq m4, 2
23707pinsrw m4, [r4 + 3], 0
23708pmaddubsw m5, m4, m6
23709pmulhrsw m5, m7
23710packuswb m3, m5
23711movu [r0 + 1011 * 16], m3
23712
23713; mode17 [row 26]
23714movu m6, [r5 + 2 * 16]
23715pmaddubsw m3, m0, m6
23716pmulhrsw m3, m7
23717pmaddubsw m5, m2, m6
23718pmulhrsw m5, m7
23719packuswb m3, m5
23720movu [r0 + 1012 * 16], m3
23721
23722pmaddubsw m3, m1, m6
23723pmulhrsw m3, m7
23724pmaddubsw m5, m4, m6
23725pmulhrsw m5, m7
23726packuswb m3, m5
23727movu [r0 + 1013 * 16], m3
23728
23729; mode17 [row 27]
23730movu m6, [r5 + 8 * 16]
23731pslldq m0, 2
23732pinsrb m0, [r3 + 26], 1
23733pinsrb m0, [r3 + 27], 0
23734pmaddubsw m3, m0, m6
23735pmulhrsw m3, m7
23736pslldq m2, 2
23737pinsrb m2, [r3 + 16], 1
23738pinsrb m2, [r3 + 17], 0
23739pmaddubsw m5, m2, m6
23740pmulhrsw m5, m7
23741packuswb m3, m5
23742movu [r0 + 1014 * 16], m3
23743
23744pslldq m1, 2
23745pinsrb m1, [r3 + 6], 1
23746pinsrb m1, [r3 + 7], 0
23747pmaddubsw m3, m1, m6
23748pmulhrsw m3, m7
23749pslldq m4, 2
23750pinsrw m4, [r4 + 2], 0
23751pmaddubsw m5, m4, m6
23752pmulhrsw m5, m7
23753packuswb m3, m5
23754movu [r0 + 1015 * 16], m3
23755
23756; mode17 [row 28]
23757movu m6, [r5 + 14 * 16]
23758pslldq m0, 2
23759pinsrb m0, [r3 + 27], 1
23760pinsrb m0, [r3 + 28], 0
23761pmaddubsw m3, m0, m6
23762pmulhrsw m3, m7
23763pslldq m2, 2
23764pinsrb m2, [r3 + 17], 1
23765pinsrb m2, [r3 + 18], 0
23766pmaddubsw m5, m2, m6
23767pmulhrsw m5, m7
23768packuswb m3, m5
23769movu [r0 + 1016 * 16], m3
23770
23771pslldq m1, 2
23772pinsrb m1, [r3 + 7], 1
23773pinsrb m1, [r3 + 9], 0
23774pmaddubsw m3, m1, m6
23775pmulhrsw m3, m7
23776pslldq m4, 2
23777pinsrw m4, [r4 + 1], 0
23778pmaddubsw m5, m4, m6
23779pmulhrsw m5, m7
23780packuswb m3, m5
23781movu [r0 + 1017 * 16], m3
23782
23783; mode17 [row 29]
23784movu m6, [r5 + 20 * 16]
23785pslldq m0, 2
23786pinsrb m0, [r3 + 28], 1
23787pinsrb m0, [r3 + 30], 0
23788pmaddubsw m3, m0, m6
23789pmulhrsw m3, m7
23790pslldq m2, 2
23791pinsrb m2, [r3 + 18], 1
23792pinsrb m2, [r3 + 20], 0
23793pmaddubsw m5, m2, m6
23794pmulhrsw m5, m7
23795packuswb m3, m5
23796movu [r0 + 1018 * 16], m3
23797
23798pslldq m1, 2
23799pinsrb m1, [r3 + 9], 1
23800pinsrb m1, [r3 + 10], 0
23801pmaddubsw m3, m1, m6
23802pmulhrsw m3, m7
23803pslldq m4, 2
23804pinsrw m4, [r4 + 0], 0
23805pmaddubsw m5, m4, m6
23806pmulhrsw m5, m7
23807packuswb m3, m5
23808movu [r0 + 1019 * 16], m3
23809
23810; mode17 [row 30]
23811movu m6, [r5 + 26 * 16]
23812pslldq m0, 2
23813pinsrb m0, [r3 + 30], 1
23814pinsrb m0, [r3 + 31], 0
23815pmaddubsw m3, m0, m6
23816pmulhrsw m3, m7
23817pslldq m2, 2
23818pinsrb m2, [r3 + 20], 1
23819pinsrb m2, [r3 + 21], 0
23820pmaddubsw m5, m2, m6
23821pmulhrsw m5, m7
23822packuswb m3, m5
23823movu [r0 + 1020 * 16], m3
23824
23825pslldq m1, 2
23826pinsrb m1, [r3 + 10], 1
23827pinsrb m1, [r3 + 11], 0
23828pmaddubsw m3, m1, m6
23829pmulhrsw m3, m7
23830pslldq m4, 2
23831pinsrb m4, [r4 + 0], 1
23832pinsrb m4, [r3 + 1], 0
23833pmaddubsw m5, m4, m6
23834pmulhrsw m5, m7
23835packuswb m3, m5
23836movu [r0 + 1021 * 16], m3
23837
23838; mode17 [row 31]
23839pshufb m5, m0, [tab_S2]
23840movh [r0 + 1022 * 16], m5
23841pshufb m5, m2, [tab_S2]
23842movh [r0 + 1022 * 16 + 8], m5
23843pshufb m5, m1, [tab_S2]
23844movh [r0 + 1023 * 16], m5
23845pshufb m5, m4, [tab_S2]
23846movh [r0 + 1023 * 16 + 8], m5
23847
23848;mode 18[row 0]
23849movu m0, [r3]
23850movu [r0 + 1024 * 16], m0
23851movu m1, [r3 + 16]
23852movu [r0 + 1025 * 16], m1
23853
23854;mode 18[row 1]
23855pslldq m0, 1
23856pinsrb m0, [r4 + 1], 0
23857movu [r0 + 1026 * 16], m0
23858pslldq m1, 1
23859pinsrb m1, [r3 + 15], 0
23860movu [r0 + 1027 * 16], m1
23861
23862;mode 18[row 2]
23863pslldq m0, 1
23864pinsrb m0, [r4 + 2], 0
23865movu [r0 + 1028 * 16], m0
23866pslldq m1, 1
23867pinsrb m1, [r3 + 14], 0
23868movu [r0 + 1029 * 16], m1
23869
23870;mode 18[row 3]
23871pslldq m0, 1
23872pinsrb m0, [r4 + 3], 0
23873movu [r0 + 1030 * 16], m0
23874pslldq m1, 1
23875pinsrb m1, [r3 + 13], 0
23876movu [r0 + 1031 * 16], m1
23877
23878;mode 18[row 4]
23879pslldq m0, 1
23880pinsrb m0, [r4 + 4], 0
23881movu [r0 + 1032 * 16], m0
23882pslldq m1, 1
23883pinsrb m1, [r3 + 12], 0
23884movu [r0 + 1033 * 16], m1
23885
23886;mode 18[row 5]
23887pslldq m0, 1
23888pinsrb m0, [r4 + 5], 0
23889movu [r0 + 1034 * 16], m0
23890pslldq m1, 1
23891pinsrb m1, [r3 + 11], 0
23892movu [r0 + 1035 * 16], m1
23893
23894;mode 18[row 6]
23895pslldq m0, 1
23896pinsrb m0, [r4 + 6], 0
23897movu [r0 + 1036 * 16], m0
23898pslldq m1, 1
23899pinsrb m1, [r3 + 10], 0
23900movu [r0 + 1037 * 16], m1
23901
23902;mode 18[row 7]
23903pslldq m0, 1
23904pinsrb m0, [r4 + 7], 0
23905movu [r0 + 1038 * 16], m0
23906pslldq m1, 1
23907pinsrb m1, [r3 + 9], 0
23908movu [r0 + 1039 * 16], m1
23909
23910;mode 18[row 8]
23911pslldq m0, 1
23912pinsrb m0, [r4 + 8], 0
23913movu [r0 + 1040 * 16], m0
23914pslldq m1, 1
23915pinsrb m1, [r3 + 8], 0
23916movu [r0 + 1041 * 16], m1
23917
23918;mode 18[row 9]
23919pslldq m0, 1
23920pinsrb m0, [r4 + 9], 0
23921movu [r0 + 1042 * 16], m0
23922pslldq m1, 1
23923pinsrb m1, [r3 + 7], 0
23924movu [r0 + 1043 * 16], m1
23925
23926;mode 18[row 10]
23927pslldq m0, 1
23928pinsrb m0, [r4 + 10], 0
23929movu [r0 + 1044 * 16], m0
23930pslldq m1, 1
23931pinsrb m1, [r3 + 6], 0
23932movu [r0 + 1045 * 16], m1
23933
23934;mode 18[row 11]
23935pslldq m0, 1
23936pinsrb m0, [r4 + 11], 0
23937movu [r0 + 1046 * 16], m0
23938pslldq m1, 1
23939pinsrb m1, [r3 + 5], 0
23940movu [r0 + 1047 * 16], m1
23941
23942;mode 18[row 12]
23943pslldq m0, 1
23944pinsrb m0, [r4 + 12], 0
23945movu [r0 + 1048 * 16], m0
23946pslldq m1, 1
23947pinsrb m1, [r3 + 4], 0
23948movu [r0 + 1049 * 16], m1
23949
23950;mode 18[row 13]
23951pslldq m0, 1
23952pinsrb m0, [r4 + 13], 0
23953movu [r0 + 1050 * 16], m0
23954pslldq m1, 1
23955pinsrb m1, [r3 + 3], 0
23956movu [r0 + 1051 * 16], m1
23957
23958;mode 18[row 14]
23959pslldq m0, 1
23960pinsrb m0, [r4 + 14], 0
23961movu [r0 + 1052 * 16], m0
23962pslldq m1, 1
23963pinsrb m1, [r3 + 2], 0
23964movu [r0 + 1053 * 16], m1
23965
23966;mode 18[row 15]
23967pslldq m0, 1
23968pinsrb m0, [r4 + 15], 0
23969movu [r0 + 1054 * 16], m0
23970pslldq m1, 1
23971pinsrb m1, [r3 + 1], 0
23972movu [r0 + 1055 * 16], m1
23973
23974;mode 18[row 16]
23975pslldq m0, 1
23976pinsrb m0, [r4 + 16], 0
23977movu [r0 + 1056 * 16], m0
23978pslldq m1, 1
23979pinsrb m1, [r3 + 0], 0
23980movu [r0 + 1057 * 16], m1
23981
23982;mode 18[row 17]
23983pslldq m0, 1
23984pinsrb m0, [r4 + 17], 0
23985movu [r0 + 1058 * 16], m0
23986pslldq m1, 1
23987pinsrb m1, [r4 + 1], 0
23988movu [r0 + 1059 * 16], m1
23989
23990;mode 18[row 18]
23991pslldq m0, 1
23992pinsrb m0, [r4 + 18], 0
23993movu [r0 + 1060 * 16], m0
23994pslldq m1, 1
23995pinsrb m1, [r4 + 2], 0
23996movu [r0 + 1061 * 16], m1
23997
23998;mode 18[row 19]
23999pslldq m0, 1
24000pinsrb m0, [r4 + 19], 0
24001movu [r0 + 1062 * 16], m0
24002pslldq m1, 1
24003pinsrb m1, [r4 + 3], 0
24004movu [r0 + 1063 * 16], m1
24005
24006;mode 18[row 20]
24007pslldq m0, 1
24008pinsrb m0, [r4 + 20], 0
24009movu [r0 + 1064 * 16], m0
24010pslldq m1, 1
24011pinsrb m1, [r4 + 4], 0
24012movu [r0 + 1065 * 16], m1
24013
24014;mode 18[row 21]
24015pslldq m0, 1
24016pinsrb m0, [r4 + 21], 0
24017movu [r0 + 1066 * 16], m0
24018pslldq m1, 1
24019pinsrb m1, [r4 + 5], 0
24020movu [r0 + 1067 * 16], m1
24021
24022;mode 18[row 22]
24023pslldq m0, 1
24024pinsrb m0, [r4 + 22], 0
24025movu [r0 + 1068 * 16], m0
24026pslldq m1, 1
24027pinsrb m1, [r4 + 6], 0
24028movu [r0 + 1069 * 16], m1
24029
24030;mode 18[row 23]
24031pslldq m0, 1
24032pinsrb m0, [r4 + 23], 0
24033movu [r0 + 1070 * 16], m0
24034pslldq m1, 1
24035pinsrb m1, [r4 + 7], 0
24036movu [r0 + 1071 * 16], m1
24037
24038;mode 18[row 24]
24039pslldq m0, 1
24040pinsrb m0, [r4 + 24], 0
24041movu [r0 + 1072 * 16], m0
24042pslldq m1, 1
24043pinsrb m1, [r4 + 8], 0
24044movu [r0 + 1073 * 16], m1
24045
24046;mode 18[row 25]
24047pslldq m0, 1
24048pinsrb m0, [r4 + 25], 0
24049movu [r0 + 1074 * 16], m0
24050pslldq m1, 1
24051pinsrb m1, [r4 + 9], 0
24052movu [r0 + 1075 * 16], m1
24053
24054;mode 18[row 26]
24055pslldq m0, 1
24056pinsrb m0, [r4 + 26], 0
24057movu [r0 + 1076 * 16], m0
24058pslldq m1, 1
24059pinsrb m1, [r4 + 10], 0
24060movu [r0 + 1077 * 16], m1
24061
24062;mode 18[row 27]
24063pslldq m0, 1
24064pinsrb m0, [r4 + 27], 0
24065movu [r0 + 1078 * 16], m0
24066pslldq m1, 1
24067pinsrb m1, [r4 + 11], 0
24068movu [r0 + 1079 * 16], m1
24069
24070;mode 18[row 28]
24071pslldq m0, 1
24072pinsrb m0, [r4 + 28], 0
24073movu [r0 + 1080 * 16], m0
24074pslldq m1, 1
24075pinsrb m1, [r4 + 12], 0
24076movu [r0 + 1081 * 16], m1
24077
24078;mode 18[row 29]
24079pslldq m0, 1
24080pinsrb m0, [r4 + 29], 0
24081movu [r0 + 1082 * 16], m0
24082pslldq m1, 1
24083pinsrb m1, [r4 + 13], 0
24084movu [r0 + 1083 * 16], m1
24085
24086;mode 18[row 30]
24087pslldq m0, 1
24088pinsrb m0, [r4 + 30], 0
24089movu [r0 + 1084 * 16], m0
24090pslldq m1, 1
24091pinsrb m1, [r4 + 14], 0
24092movu [r0 + 1085 * 16], m1
24093
24094;mode 18[row 31]
24095pslldq m0, 1
24096pinsrb m0, [r4 + 31], 0
24097movu [r0 + 1086 * 16], m0
24098pslldq m1, 1
24099pinsrb m1, [r4 + 15], 0
24100movu [r0 + 1087 * 16], m1
24101
24102; mode 19 [row 0]
24103movu m6, [r5 + 6 * 16]
24104movu m0, [r3 ]
24105movu m1, [r3 + 1 ]
24106punpcklbw m0, m1
24107pmaddubsw m1, m0, m6
24108pmulhrsw m1, m7
24109movu m2, [r3 + 8]
24110movu m3, [r3 + 9]
24111punpcklbw m2, m3
24112pmaddubsw m3, m2, m6
24113pmulhrsw m3, m7
24114packuswb m1, m3
24115movu [r0 + 1088 * 16], m1
24116
24117movu m1, [r3 + 16]
24118movu m3, [r3 + 17]
24119punpcklbw m1, m3
24120pmaddubsw m4, m1, m6
24121pmulhrsw m4, m7
24122movu m3, [r3 + 24]
24123movu m5, [r3 + 25]
24124punpcklbw m3, m5
24125pmaddubsw m5, m3, m6
24126pmulhrsw m5, m7
24127packuswb m4, m5
24128movu [r0 + 1089 * 16], m4
24129
24130; mode 19 [row 1]
24131movu m6, [r5 + 12 * 16]
24132pslldq m0, 2
24133pinsrb m0, [r4 + 0], 1
24134pinsrb m0, [r4 + 1], 0
24135pmaddubsw m4, m0, m6
24136pmulhrsw m4, m7
24137pslldq m2, 2
24138pinsrw m2, [r3 + 7], 0
24139pmaddubsw m5, m2, m6
24140pmulhrsw m5, m7
24141packuswb m4, m5
24142movu [r0 + 1090 * 16], m4
24143pslldq m1, 2
24144pinsrw m1, [r3 + 15], 0
24145pmaddubsw m4, m1, m6
24146pmulhrsw m4, m7
24147pslldq m3, 2
24148pinsrw m3, [r3 + 23], 0
24149pmaddubsw m5, m3, m6
24150pmulhrsw m5, m7
24151packuswb m4, m5
24152movu [r0 + 1091 * 16], m4
24153
24154; mode 19 [row 2]
24155movu m6, [r5 + 18 * 16]
24156pslldq m0, 2
24157pinsrb m0, [r4 + 1], 1
24158pinsrb m0, [r4 + 2], 0
24159pmaddubsw m4, m0, m6
24160pmulhrsw m4, m7
24161pslldq m2, 2
24162pinsrw m2, [r3 + 6], 0
24163pmaddubsw m5, m2, m6
24164pmulhrsw m5, m7
24165packuswb m4, m5
24166movu [r0 + 1092 * 16], m4
24167pslldq m1, 2
24168pinsrw m1, [r3 + 14], 0
24169pmaddubsw m4, m1, m6
24170pmulhrsw m4, m7
24171pslldq m3, 2
24172pinsrw m3, [r3 + 22], 0
24173pmaddubsw m5, m3, m6
24174pmulhrsw m5, m7
24175packuswb m4, m5
24176movu [r0 + 1093 * 16], m4
24177
24178; mode 19 [row 3]
24179movu m6, [r5 + 24 * 16]
24180pslldq m0, 2
24181pinsrb m0, [r4 + 2], 1
24182pinsrb m0, [r4 + 4], 0
24183pmaddubsw m4, m0, m6
24184pmulhrsw m4, m7
24185pslldq m2, 2
24186pinsrw m2, [r3 + 5], 0
24187pmaddubsw m5, m2, m6
24188pmulhrsw m5, m7
24189packuswb m4, m5
24190movu [r0 + 1094 * 16], m4
24191pslldq m1, 2
24192pinsrw m1, [r3 + 13], 0
24193pmaddubsw m4, m1, m6
24194pmulhrsw m4, m7
24195pslldq m3, 2
24196pinsrw m3, [r3 + 21], 0
24197pmaddubsw m5, m3, m6
24198pmulhrsw m5, m7
24199packuswb m4, m5
24200movu [r0 + 1095 * 16], m4
24201
24202; mode 19 [row 4]
24203movu m6, [r5 + 30 * 16]
24204pslldq m0, 2
24205pinsrb m0, [r4 + 4], 1
24206pinsrb m0, [r4 + 5], 0
24207pmaddubsw m4, m0, m6
24208pmulhrsw m4, m7
24209pslldq m2, 2
24210pinsrw m2, [r3 + 4], 0
24211pmaddubsw m5, m2, m6
24212pmulhrsw m5, m7
24213packuswb m4, m5
24214movu [r0 + 1096 * 16], m4
24215pslldq m1, 2
24216pinsrw m1, [r3 + 12], 0
24217pmaddubsw m4, m1, m6
24218pmulhrsw m4, m7
24219pslldq m3, 2
24220pinsrw m3, [r3 + 20], 0
24221pmaddubsw m5, m3, m6
24222pmulhrsw m5, m7
24223packuswb m4, m5
24224movu [r0 + 1097 * 16], m4
24225
24226; mode 19 [row 5]
24227movu m6, [r5 + 4 * 16]
24228pmaddubsw m4, m0, m6
24229pmulhrsw m4, m7
24230pmaddubsw m5, m2, m6
24231pmulhrsw m5, m7
24232packuswb m4, m5
24233movu [r0 + 1098 * 16], m4
24234pmaddubsw m4, m1, m6
24235pmulhrsw m4, m7
24236pmaddubsw m5, m3, m6
24237pmulhrsw m5, m7
24238packuswb m4, m5
24239movu [r0 + 1099 * 16], m4
24240
24241; mode 19 [row 6]
24242movu m6, [r5 + 10 * 16]
24243pslldq m0, 2
24244pinsrb m0, [r4 + 5], 1
24245pinsrb m0, [r4 + 6], 0
24246pmaddubsw m4, m0, m6
24247pmulhrsw m4, m7
24248pslldq m2, 2
24249pinsrw m2, [r3 + 3], 0
24250pmaddubsw m5, m2, m6
24251pmulhrsw m5, m7
24252packuswb m4, m5
24253movu [r0 + 1100 * 16], m4
24254pslldq m1, 2
24255pinsrw m1, [r3 + 11], 0
24256pmaddubsw m4, m1, m6
24257pmulhrsw m4, m7
24258pslldq m3, 2
24259pinsrw m3, [r3 + 19], 0
24260pmaddubsw m5, m3, m6
24261pmulhrsw m5, m7
24262packuswb m4, m5
24263movu [r0 + 1101 * 16], m4
24264
24265; mode 19 [row 7]
24266movu m6, [r5 + 16 * 16]
24267pslldq m0, 2
24268pinsrb m0, [r4 + 6], 1
24269pinsrb m0, [r4 + 7], 0
24270pmaddubsw m4, m0, m6
24271pmulhrsw m4, m7
24272pslldq m2, 2
24273pinsrw m2, [r3 + 2], 0
24274pmaddubsw m5, m2, m6
24275pmulhrsw m5, m7
24276packuswb m4, m5
24277movu [r0 + 1102 * 16], m4
24278pslldq m1, 2
24279pinsrw m1, [r3 + 10], 0
24280pmaddubsw m4, m1, m6
24281pmulhrsw m4, m7
24282pslldq m3, 2
24283pinsrw m3, [r3 + 18], 0
24284pmaddubsw m5, m3, m6
24285pmulhrsw m5, m7
24286packuswb m4, m5
24287movu [r0 + 1103 * 16], m4
24288
24289; mode 19 [row 8]
24290movu m6, [r5 + 22 * 16]
24291pslldq m0, 2
24292pinsrb m0, [r4 + 7], 1
24293pinsrb m0, [r4 + 9], 0
24294pmaddubsw m4, m0, m6
24295pmulhrsw m4, m7
24296pslldq m2, 2
24297pinsrw m2, [r3 + 1], 0
24298pmaddubsw m5, m2, m6
24299pmulhrsw m5, m7
24300packuswb m4, m5
24301movu [r0 + 1104 * 16], m4
24302pslldq m1, 2
24303pinsrw m1, [r3 + 9], 0
24304pmaddubsw m4, m1, m6
24305pmulhrsw m4, m7
24306pslldq m3, 2
24307pinsrw m3, [r3 + 17], 0
24308pmaddubsw m5, m3, m6
24309pmulhrsw m5, m7
24310packuswb m4, m5
24311movu [r0 + 1105 * 16], m4
24312
24313; mode 19 [row 9]
24314movu m6, [r5 + 28 * 16]
24315pslldq m0, 2
24316pinsrb m0, [r4 + 9], 1
24317pinsrb m0, [r4 + 10], 0
24318pmaddubsw m4, m0, m6
24319pmulhrsw m4, m7
24320pslldq m2, 2
24321pinsrw m2, [r3 + 0], 0
24322pmaddubsw m5, m2, m6
24323pmulhrsw m5, m7
24324packuswb m4, m5
24325movu [r0 + 1106 * 16], m4
24326pslldq m1, 2
24327pinsrw m1, [r3 + 8], 0
24328pmaddubsw m4, m1, m6
24329pmulhrsw m4, m7
24330pslldq m3, 2
24331pinsrw m3, [r3 + 16], 0
24332pmaddubsw m5, m3, m6
24333pmulhrsw m5, m7
24334packuswb m4, m5
24335movu [r0 + 1107 * 16], m4
24336
24337; mode 19 [row 10]
24338movu m6, [r5 + 2 * 16]
24339pmaddubsw m4, m0, m6
24340pmulhrsw m4, m7
24341pmaddubsw m5, m2, m6
24342pmulhrsw m5, m7
24343packuswb m4, m5
24344movu [r0 + 1108 * 16], m4
24345pmaddubsw m4, m1, m6
24346pmulhrsw m4, m7
24347pmaddubsw m5, m3, m6
24348pmulhrsw m5, m7
24349packuswb m4, m5
24350movu [r0 + 1109 * 16], m4
24351
24352; mode 19 [row 11]
24353movu m6, [r5 + 8 * 16]
24354pslldq m0, 2
24355pinsrb m0, [r4 + 10], 1
24356pinsrb m0, [r4 + 11], 0
24357pmaddubsw m4, m0, m6
24358pmulhrsw m4, m7
24359pslldq m2, 2
24360pinsrb m2, [r3 + 0], 1
24361pinsrb m2, [r4 + 1], 0
24362pmaddubsw m5, m2, m6
24363pmulhrsw m5, m7
24364packuswb m4, m5
24365movu [r0 + 1110 * 16], m4
24366pslldq m1, 2
24367pinsrw m1, [r3 + 7], 0
24368pmaddubsw m4, m1, m6
24369pmulhrsw m4, m7
24370pslldq m3, 2
24371pinsrw m3, [r3 + 15], 0
24372pmaddubsw m5, m3, m6
24373pmulhrsw m5, m7
24374packuswb m4, m5
24375movu [r0 + 1111 * 16], m4
24376
24377; mode 19 [row 12]
24378movu m6, [r5 + 14 * 16]
24379pslldq m0, 2
24380pinsrb m0, [r4 + 11], 1
24381pinsrb m0, [r4 + 12], 0
24382pmaddubsw m4, m0, m6
24383pmulhrsw m4, m7
24384pslldq m2, 2
24385pinsrb m2, [r4 + 1], 1
24386pinsrb m2, [r4 + 2], 0
24387pmaddubsw m5, m2, m6
24388pmulhrsw m5, m7
24389packuswb m4, m5
24390movu [r0 + 1112 * 16], m4
24391pslldq m1, 2
24392pinsrw m1, [r3 + 6], 0
24393pmaddubsw m4, m1, m6
24394pmulhrsw m4, m7
24395pslldq m3, 2
24396pinsrw m3, [r3 + 14], 0
24397pmaddubsw m5, m3, m6
24398pmulhrsw m5, m7
24399packuswb m4, m5
24400movu [r0 + 1113 * 16], m4
24401
24402; mode 19 [row 13]
24403movu m6, [r5 + 20 * 16]
24404pslldq m0, 2
24405pinsrb m0, [r4 + 12], 1
24406pinsrb m0, [r4 + 14], 0
24407pmaddubsw m4, m0, m6
24408pmulhrsw m4, m7
24409pslldq m2, 2
24410pinsrb m2, [r4 + 2], 1
24411pinsrb m2, [r4 + 4], 0
24412pmaddubsw m5, m2, m6
24413pmulhrsw m5, m7
24414packuswb m4, m5
24415movu [r0 + 1114 * 16], m4
24416pslldq m1, 2
24417pinsrw m1, [r3 + 5], 0
24418pmaddubsw m4, m1, m6
24419pmulhrsw m4, m7
24420pslldq m3, 2
24421pinsrw m3, [r3 + 13], 0
24422pmaddubsw m5, m3, m6
24423pmulhrsw m5, m7
24424packuswb m4, m5
24425movu [r0 + 1115 * 16], m4
24426
24427; mode 19 [row 14]
24428movu m6, [r5 + 26 * 16]
24429pslldq m0, 2
24430pinsrb m0, [r4 + 14], 1
24431pinsrb m0, [r4 + 15], 0
24432pmaddubsw m4, m0, m6
24433pmulhrsw m4, m7
24434pslldq m2, 2
24435pinsrb m2, [r4 + 4], 1
24436pinsrb m2, [r4 + 5], 0
24437pmaddubsw m5, m2, m6
24438pmulhrsw m5, m7
24439packuswb m4, m5
24440movu [r0 + 1116 * 16], m4
24441pslldq m1, 2
24442pinsrw m1, [r3 + 4], 0
24443pmaddubsw m4, m1, m6
24444pmulhrsw m4, m7
24445pslldq m3, 2
24446pinsrw m3, [r3 + 12], 0
24447pmaddubsw m5, m3, m6
24448pmulhrsw m5, m7
24449packuswb m4, m5
24450movu [r0 + 1117 * 16], m4
24451
24452; mode 19 [row 15]
24453pshufb m5, m0, [tab_S2]
24454movh [r0 + 1118 * 16], m5
24455pshufb m5, m2, [tab_S2]
24456movh [r0 + 1118 * 16 + 8], m5
24457pshufb m5, m1, [tab_S2]
24458movh [r0 + 1119 * 16], m5
24459pshufb m5, m3, [tab_S2]
24460movh [r0 + 1119 * 16 + 8], m5
24461
24462; mode 19 [row 16]
24463movu m6, [r5 + 6 * 16]
24464pslldq m0, 2
24465pinsrb m0, [r4 + 15], 1
24466pinsrb m0, [r4 + 16], 0
24467pmaddubsw m4, m0, m6
24468pmulhrsw m4, m7
24469pslldq m2, 2
24470pinsrb m2, [r4 + 5], 1
24471pinsrb m2, [r4 + 6], 0
24472pmaddubsw m5, m2, m6
24473pmulhrsw m5, m7
24474packuswb m4, m5
24475movu [r0 + 1120 * 16], m4
24476pslldq m1, 2
24477pinsrw m1, [r3 + 3], 0
24478pmaddubsw m4, m1, m6
24479pmulhrsw m4, m7
24480pslldq m3, 2
24481pinsrw m3, [r3 + 11], 0
24482pmaddubsw m5, m3, m6
24483pmulhrsw m5, m7
24484packuswb m4, m5
24485movu [r0 + 1121 * 16], m4
24486
24487; mode 19 [row 17]
24488movu m6, [r5 + 12 * 16]
24489pslldq m0, 2
24490pinsrb m0, [r4 + 16], 1
24491pinsrb m0, [r4 + 17], 0
24492pmaddubsw m4, m0, m6
24493pmulhrsw m4, m7
24494pslldq m2, 2
24495pinsrb m2, [r4 + 6], 1
24496pinsrb m2, [r4 + 7], 0
24497pmaddubsw m5, m2, m6
24498pmulhrsw m5, m7
24499packuswb m4, m5
24500movu [r0 + 1122 * 16], m4
24501pslldq m1, 2
24502pinsrw m1, [r3 + 2], 0
24503pmaddubsw m4, m1, m6
24504pmulhrsw m4, m7
24505pslldq m3, 2
24506pinsrw m3, [r3 + 10], 0
24507pmaddubsw m5, m3, m6
24508pmulhrsw m5, m7
24509packuswb m4, m5
24510movu [r0 + 1123 * 16], m4
24511
24512; mode 19 [row 18]
24513movu m6, [r5 + 18 * 16]
24514pslldq m0, 2
24515pinsrb m0, [r4 + 17], 1
24516pinsrb m0, [r4 + 18], 0
24517pmaddubsw m4, m0, m6
24518pmulhrsw m4, m7
24519pslldq m2, 2
24520pinsrb m2, [r4 + 7], 1
24521pinsrb m2, [r4 + 9], 0
24522pmaddubsw m5, m2, m6
24523pmulhrsw m5, m7
24524packuswb m4, m5
24525movu [r0 + 1124 * 16], m4
24526pslldq m1, 2
24527pinsrw m1, [r3 + 1], 0
24528pmaddubsw m4, m1, m6
24529pmulhrsw m4, m7
24530pslldq m3, 2
24531pinsrw m3, [r3 + 9], 0
24532pmaddubsw m5, m3, m6
24533pmulhrsw m5, m7
24534packuswb m4, m5
24535movu [r0 + 1125 * 16], m4
24536
24537; mode 19 [row 19]
24538movu m6, [r5 + 24 * 16]
24539pslldq m0, 2
24540pinsrb m0, [r4 + 18], 1
24541pinsrb m0, [r4 + 20], 0
24542pmaddubsw m4, m0, m6
24543pmulhrsw m4, m7
24544pslldq m2, 2
24545pinsrb m2, [r4 + 9], 1
24546pinsrb m2, [r4 + 10], 0
24547pmaddubsw m5, m2, m6
24548pmulhrsw m5, m7
24549packuswb m4, m5
24550movu [r0 + 1126 * 16], m4
24551pslldq m1, 2
24552pinsrw m1, [r3 + 0], 0
24553pmaddubsw m4, m1, m6
24554pmulhrsw m4, m7
24555pslldq m3, 2
24556pinsrw m3, [r3 + 8], 0
24557pmaddubsw m5, m3, m6
24558pmulhrsw m5, m7
24559packuswb m4, m5
24560movu [r0 + 1127 * 16], m4
24561
24562; mode 19 [row 20]
24563movu m6, [r5 + 30 * 16]
24564pslldq m0, 2
24565pinsrb m0, [r4 + 20], 1
24566pinsrb m0, [r4 + 21], 0
24567pmaddubsw m4, m0, m6
24568pmulhrsw m4, m7
24569pslldq m2, 2
24570pinsrb m2, [r4 + 10], 1
24571pinsrb m2, [r4 + 11], 0
24572pmaddubsw m5, m2, m6
24573pmulhrsw m5, m7
24574packuswb m4, m5
24575movu [r0 + 1128 * 16], m4
24576pslldq m1, 2
24577pinsrb m1, [r4 + 0], 1
24578pinsrb m1, [r4 + 1], 0
24579pmaddubsw m4, m1, m6
24580pmulhrsw m4, m7
24581pslldq m3, 2
24582pinsrb m3, [r3 + 8], 1
24583pinsrb m3, [r3 + 7], 0
24584pmaddubsw m5, m3, m6
24585pmulhrsw m5, m7
24586packuswb m4, m5
24587movu [r0 + 1129 * 16], m4
24588
24589; mode 19 [row 21]
24590movu m6, [r5 + 4 * 16]
24591pmaddubsw m4, m0, m6
24592pmulhrsw m4, m7
24593pmaddubsw m5, m2, m6
24594pmulhrsw m5, m7
24595packuswb m4, m5
24596movu [r0 + 1130 * 16], m4
24597pmaddubsw m4, m1, m6
24598pmulhrsw m4, m7
24599pmaddubsw m5, m3, m6
24600pmulhrsw m5, m7
24601packuswb m4, m5
24602movu [r0 + 1131 * 16], m4
24603
24604; mode 19 [row 22]
24605movu m6, [r5 + 10 * 16]
24606pslldq m0, 2
24607pinsrb m0, [r4 + 21], 1
24608pinsrb m0, [r4 + 22], 0
24609pmaddubsw m4, m0, m6
24610pmulhrsw m4, m7
24611pslldq m2, 2
24612pinsrb m2, [r4 + 11], 1
24613pinsrb m2, [r4 + 12], 0
24614pmaddubsw m5, m2, m6
24615pmulhrsw m5, m7
24616packuswb m4, m5
24617movu [r0 + 1132 * 16], m4
24618pslldq m1, 2
24619pinsrb m1, [r4 + 1], 1
24620pinsrb m1, [r4 + 2], 0
24621pmaddubsw m4, m1, m6
24622pmulhrsw m4, m7
24623pslldq m3, 2
24624pinsrw m3, [r3 + 6], 0
24625pmaddubsw m5, m3, m6
24626pmulhrsw m5, m7
24627packuswb m4, m5
24628movu [r0 + 1133 * 16], m4
24629
24630; mode 19 [row 23]
24631movu m6, [r5 + 16 * 16]
24632pslldq m0, 2
24633pinsrb m0, [r4 + 22], 1
24634pinsrb m0, [r4 + 23], 0
24635pmaddubsw m4, m0, m6
24636pmulhrsw m4, m7
24637pslldq m2, 2
24638pinsrb m2, [r4 + 12], 1
24639pinsrb m2, [r4 + 14], 0
24640pmaddubsw m5, m2, m6
24641pmulhrsw m5, m7
24642packuswb m4, m5
24643movu [r0 + 1134 * 16], m4
24644pslldq m1, 2
24645pinsrb m1, [r4 + 2], 1
24646pinsrb m1, [r4 + 4], 0
24647pmaddubsw m4, m1, m6
24648pmulhrsw m4, m7
24649pslldq m3, 2
24650pinsrw m3, [r3 + 5], 0
24651pmaddubsw m5, m3, m6
24652pmulhrsw m5, m7
24653packuswb m4, m5
24654movu [r0 + 1135 * 16], m4
24655
24656; mode 19 [row 24]
24657movu m6, [r5 + 22 * 16]
24658pslldq m0, 2
24659pinsrb m0, [r4 + 23], 1
24660pinsrb m0, [r4 + 25], 0
24661pmaddubsw m4, m0, m6
24662pmulhrsw m4, m7
24663pslldq m2, 2
24664pinsrb m2, [r4 + 14], 1
24665pinsrb m2, [r4 + 15], 0
24666pmaddubsw m5, m2, m6
24667pmulhrsw m5, m7
24668packuswb m4, m5
24669movu [r0 + 1136 * 16], m4
24670pslldq m1, 2
24671pinsrb m1, [r4 + 4], 1
24672pinsrb m1, [r4 + 5], 0
24673pmaddubsw m4, m1, m6
24674pmulhrsw m4, m7
24675pslldq m3, 2
24676pinsrw m3, [r3 + 4], 0
24677pmaddubsw m5, m3, m6
24678pmulhrsw m5, m7
24679packuswb m4, m5
24680movu [r0 + 1137 * 16], m4
24681
24682; mode 19 [row 25]
24683movu m6, [r5 + 28 * 16]
24684pslldq m0, 2
24685pinsrb m0, [r4 + 25], 1
24686pinsrb m0, [r4 + 26], 0
24687pmaddubsw m4, m0, m6
24688pmulhrsw m4, m7
24689pslldq m2, 2
24690pinsrb m2, [r4 + 15], 1
24691pinsrb m2, [r4 + 16], 0
24692pmaddubsw m5, m2, m6
24693pmulhrsw m5, m7
24694packuswb m4, m5
24695movu [r0 + 1138 * 16], m4
24696pslldq m1, 2
24697pinsrb m1, [r4 + 5], 1
24698pinsrb m1, [r4 + 6], 0
24699pmaddubsw m4, m1, m6
24700pmulhrsw m4, m7
24701pslldq m3, 2
24702pinsrw m3, [r3 + 3], 0
24703pmaddubsw m5, m3, m6
24704pmulhrsw m5, m7
24705packuswb m4, m5
24706movu [r0 + 1139 * 16], m4
24707
24708; mode 19 [row 26]
24709movu m6, [r5 + 2 * 16]
24710pmaddubsw m4, m0, m6
24711pmulhrsw m4, m7
24712pmaddubsw m5, m2, m6
24713pmulhrsw m5, m7
24714packuswb m4, m5
24715movu [r0 + 1140 * 16], m4
24716pmaddubsw m4, m1, m6
24717pmulhrsw m4, m7
24718pmaddubsw m5, m3, m6
24719pmulhrsw m5, m7
24720packuswb m4, m5
24721movu [r0 + 1141 * 16], m4
24722
24723; mode 19 [row 27]
24724movu m6, [r5 + 8 * 16]
24725pslldq m0, 2
24726pinsrb m0, [r4 + 26], 1
24727pinsrb m0, [r4 + 27], 0
24728pmaddubsw m4, m0, m6
24729pmulhrsw m4, m7
24730pslldq m2, 2
24731pinsrb m2, [r4 + 16], 1
24732pinsrb m2, [r4 + 17], 0
24733pmaddubsw m5, m2, m6
24734pmulhrsw m5, m7
24735packuswb m4, m5
24736movu [r0 + 1142 * 16], m4
24737pslldq m1, 2
24738pinsrb m1, [r4 + 6], 1
24739pinsrb m1, [r4 + 7], 0
24740pmaddubsw m4, m1, m6
24741pmulhrsw m4, m7
24742pslldq m3, 2
24743pinsrw m3, [r3 + 2], 0
24744pmaddubsw m5, m3, m6
24745pmulhrsw m5, m7
24746packuswb m4, m5
24747movu [r0 + 1143 * 16], m4
24748
24749; mode 19 [row 28]
24750movu m6, [r5 + 14 * 16]
24751pslldq m0, 2
24752pinsrb m0, [r4 + 27], 1
24753pinsrb m0, [r4 + 28], 0
24754pmaddubsw m4, m0, m6
24755pmulhrsw m4, m7
24756pslldq m2, 2
24757pinsrb m2, [r4 + 17], 1
24758pinsrb m2, [r4 + 18], 0
24759pmaddubsw m5, m2, m6
24760pmulhrsw m5, m7
24761packuswb m4, m5
24762movu [r0 + 1144 * 16], m4
24763pslldq m1, 2
24764pinsrb m1, [r4 + 7], 1
24765pinsrb m1, [r4 + 9], 0
24766pmaddubsw m4, m1, m6
24767pmulhrsw m4, m7
24768pslldq m3, 2
24769pinsrw m3, [r3 + 1], 0
24770pmaddubsw m5, m3, m6
24771pmulhrsw m5, m7
24772packuswb m4, m5
24773movu [r0 + 1145 * 16], m4
24774
24775; mode 19 [row 29]
24776movu m6, [r5 + 20 * 16]
24777pslldq m0, 2
24778pinsrb m0, [r4 + 28], 1
24779pinsrb m0, [r4 + 30], 0
24780pmaddubsw m4, m0, m6
24781pmulhrsw m4, m7
24782pslldq m2, 2
24783pinsrb m2, [r4 + 18], 1
24784pinsrb m2, [r4 + 20], 0
24785pmaddubsw m5, m2, m6
24786pmulhrsw m5, m7
24787packuswb m4, m5
24788movu [r0 + 1146 * 16], m4
24789pslldq m1, 2
24790pinsrb m1, [r4 + 9], 1
24791pinsrb m1, [r4 + 10], 0
24792pmaddubsw m4, m1, m6
24793pmulhrsw m4, m7
24794pslldq m3, 2
24795pinsrw m3, [r3 + 0], 0
24796pmaddubsw m5, m3, m6
24797pmulhrsw m5, m7
24798packuswb m4, m5
24799movu [r0 + 1147 * 16], m4
24800
24801; mode 19 [row 30]
24802movu m6, [r5 + 26 * 16]
24803pslldq m0, 2
24804pinsrb m0, [r4 + 30], 1
24805pinsrb m0, [r4 + 31], 0
24806pmaddubsw m4, m0, m6
24807pmulhrsw m4, m7
24808pslldq m2, 2
24809pinsrb m2, [r4 + 20], 1
24810pinsrb m2, [r4 + 21], 0
24811pmaddubsw m5, m2, m6
24812pmulhrsw m5, m7
24813packuswb m4, m5
24814movu [r0 + 1148 * 16], m4
24815pslldq m1, 2
24816pinsrb m1, [r4 + 10], 1
24817pinsrb m1, [r4 + 11], 0
24818pmaddubsw m4, m1, m6
24819pmulhrsw m4, m7
24820pslldq m3, 2
24821pinsrb m3, [r4 + 0], 1
24822pinsrb m3, [r4 + 1], 0
24823pmaddubsw m5, m3, m6
24824pmulhrsw m5, m7
24825packuswb m4, m5
24826movu [r0 + 1149 * 16], m4
24827
24828; mode 19 [row 31]
; Shuffle-only row: no weight vector is loaded and no pmaddubsw/pmulhrsw is
; applied here. Each of the four pair registers is passed through the tab_S2
; byte shuffle and the low 8 bytes of the result are stored.
; Store order m0, m2, m1, m3 follows the other rows of this mode: m0/m2 fill
; the first 16-byte output slot, m1/m3 the second.
; NOTE(review): tab_S2 is defined outside this chunk; presumably it gathers
; one byte from every pair (a straight copy for a zero fraction) -- confirm.
24829pshufb m5, m0, [tab_S2]
24830movh [r0 + 1150 * 16], m5
24831pshufb m5, m2, [tab_S2]
24832movh [r0 + 1150 * 16 + 8], m5
24833pshufb m5, m1, [tab_S2]
24834movh [r0 + 1151 * 16], m5
24835pshufb m5, m3, [tab_S2]
24836movh [r0 + 1151 * 16 + 8], m5
24837
24838; mode 20 [row 0]
; Builds the four "pair" registers that the following rows of this mode keep
; sliding: each holds 8 interleaved byte pairs (r3[i], r3[i + 1]).
;   m0 <- i = 0..7, m2 <- i = 8..15, m1 <- i = 16..23, m3 <- i = 24..31
; m6 = 16-byte weight vector, entry 11 of the table at r5.
; NOTE(review): r5 (weight table) and m7 (pmulhrsw multiplier) are set up
; outside this chunk; presumably (32 - f, f) weights and a rounding >> 5 -- confirm.
24839movu m6, [r5 + 11 * 16]
24840movu m0, [r3 ]
24841movu m1, [r3 + 1 ]
24842punpcklbw m0, m1 ; m0 = pairs for output bytes 0..7
24843pmaddubsw m1, m0, m6 ; x86inc 3-operand form: result goes to m1, m0 is kept
24844pmulhrsw m1, m7
24845movu m2, [r3 + 8]
24846movu m3, [r3 + 9]
24847punpcklbw m2, m3 ; m2 = pairs for output bytes 8..15
24848pmaddubsw m3, m2, m6
24849pmulhrsw m3, m7
24850packuswb m1, m3 ; saturate both halves to unsigned bytes
24851movu [r0 + 1152 * 16], m1 ; first 16 bytes of this row
24852
24853movu m1, [r3 + 16]
24854movu m3, [r3 + 17]
24855punpcklbw m1, m3 ; m1 = pairs for output bytes 16..23
24856pmaddubsw m4, m1, m6
24857pmulhrsw m4, m7
24858movu m3, [r3 + 24]
24859movu m5, [r3 + 25]
24860punpcklbw m3, m5 ; m3 = pairs for output bytes 24..31
24861pmaddubsw m5, m3, m6
24862pmulhrsw m5, m7
24863packuswb m4, m5
24864movu [r0 + 1153 * 16], m4 ; second 16 bytes of this row
24865
24866; mode 20 [row 1]
; "Shift and insert" row: every pair register is moved up by one pair
; (pslldq by 2 bytes) and a new pair is written into bytes 0/1, then the same
; pmaddubsw / pmulhrsw / packuswb / store sequence as row 0 is applied with
; this row's weight vector (table entry 22).
; m0 takes its new bytes from the r4 array; m2, m1 and m3 still take a
; consecutive byte pair from r3 through a single pinsrw.
; NOTE(review): before the shift m0's byte 0 was r3[0]; inserting [r4 + 0] as
; byte 1 relies on r4[0] and r3[0] being the same sample -- confirm with the
; set-up code outside this chunk.
24867movu m6, [r5 + 22 * 16]
24868pslldq m0, 2
24869pinsrb m0, [r4 + 0], 1
24870pinsrb m0, [r4 + 2], 0
24871pmaddubsw m4, m0, m6
24872pmulhrsw m4, m7
24873pslldq m2, 2
24874pinsrw m2, [r3 + 7], 0 ; bytes 0/1 = r3[7], r3[8]
24875pmaddubsw m5, m2, m6
24876pmulhrsw m5, m7
24877packuswb m4, m5
24878movu [r0 + 1154 * 16], m4
24879pslldq m1, 2
24880pinsrw m1, [r3 + 15], 0 ; bytes 0/1 = r3[15], r3[16]
24881pmaddubsw m4, m1, m6
24882pmulhrsw m4, m7
24883pslldq m3, 2
24884pinsrw m3, [r3 + 23], 0 ; bytes 0/1 = r3[23], r3[24]
24885pmaddubsw m5, m3, m6
24886pmulhrsw m5, m7
24887packuswb m4, m5
24888movu [r0 + 1155 * 16], m4
24889
24890; mode 20 [row 2]
; "Reuse window" row: m0/m2/m1/m3 are left exactly as the previous row set
; them; only the weight vector changes (table entry 1). Results are built in
; the scratch registers m4/m5, so the pair registers stay intact for the rows
; that follow.
24891movu m6, [r5 + 1 * 16]
24892pmaddubsw m4, m0, m6
24893pmulhrsw m4, m7
24894pmaddubsw m5, m2, m6
24895pmulhrsw m5, m7
24896packuswb m4, m5
24897movu [r0 + 1156 * 16], m4 ; output bytes 0..15
24898pmaddubsw m4, m1, m6
24899pmulhrsw m4, m7
24900pmaddubsw m5, m3, m6
24901pmulhrsw m5, m7
24902packuswb m4, m5
24903movu [r0 + 1157 * 16], m4 ; output bytes 16..31
24904
24905; mode 20 [row 3]
24906movu m6, [r5 + 12 * 16]
24907pslldq m0, 2
24908pinsrb m0, [r4 + 2], 1
24909pinsrb m0, [r4 + 3], 0
24910pmaddubsw m4, m0, m6
24911pmulhrsw m4, m7
24912pslldq m2, 2
24913pinsrw m2, [r3 + 6], 0
24914pmaddubsw m5, m2, m6
24915pmulhrsw m5, m7
24916packuswb m4, m5
24917movu [r0 + 1158 * 16], m4
24918pslldq m1, 2
24919pinsrw m1, [r3 + 14], 0
24920pmaddubsw m4, m1, m6
24921pmulhrsw m4, m7
24922pslldq m3, 2
24923pinsrw m3, [r3 + 22], 0
24924pmaddubsw m5, m3, m6
24925pmulhrsw m5, m7
24926packuswb m4, m5
24927movu [r0 + 1159 * 16], m4
24928
24929; mode 20 [row 4]
24930movu m6, [r5 + 23 * 16]
24931pslldq m0, 2
24932pinsrb m0, [r4 + 3], 1
24933pinsrb m0, [r4 + 5], 0
24934pmaddubsw m4, m0, m6
24935pmulhrsw m4, m7
24936pslldq m2, 2
24937pinsrw m2, [r3 + 5], 0
24938pmaddubsw m5, m2, m6
24939pmulhrsw m5, m7
24940packuswb m4, m5
24941movu [r0 + 1160 * 16], m4
24942pslldq m1, 2
24943pinsrw m1, [r3 + 13], 0
24944pmaddubsw m4, m1, m6
24945pmulhrsw m4, m7
24946pslldq m3, 2
24947pinsrw m3, [r3 + 21], 0
24948pmaddubsw m5, m3, m6
24949pmulhrsw m5, m7
24950packuswb m4, m5
24951movu [r0 + 1161 * 16], m4
24952
24953; mode 20 [row 5]
24954movu m6, [r5 + 2 * 16]
24955pmaddubsw m4, m0, m6
24956pmulhrsw m4, m7
24957pmaddubsw m5, m2, m6
24958pmulhrsw m5, m7
24959packuswb m4, m5
24960movu [r0 + 1162 * 16], m4
24961pmaddubsw m4, m1, m6
24962pmulhrsw m4, m7
24963pmaddubsw m5, m3, m6
24964pmulhrsw m5, m7
24965packuswb m4, m5
24966movu [r0 + 1163 * 16], m4
24967
24968; mode 20 [row 6]
24969movu m6, [r5 + 13 * 16]
24970pslldq m0, 2
24971pinsrb m0, [r4 + 5], 1
24972pinsrb m0, [r4 + 6], 0
24973pmaddubsw m4, m0, m6
24974pmulhrsw m4, m7
24975pslldq m2, 2
24976pinsrw m2, [r3 + 4], 0
24977pmaddubsw m5, m2, m6
24978pmulhrsw m5, m7
24979packuswb m4, m5
24980movu [r0 + 1164 * 16], m4
24981pslldq m1, 2
24982pinsrw m1, [r3 + 12], 0
24983pmaddubsw m4, m1, m6
24984pmulhrsw m4, m7
24985pslldq m3, 2
24986pinsrw m3, [r3 + 20], 0
24987pmaddubsw m5, m3, m6
24988pmulhrsw m5, m7
24989packuswb m4, m5
24990movu [r0 + 1165 * 16], m4
24991
24992; mode 20 [row 7]
24993movu m6, [r5 + 24 * 16]
24994pslldq m0, 2
24995pinsrb m0, [r4 + 6], 1
24996pinsrb m0, [r4 + 8], 0
24997pmaddubsw m4, m0, m6
24998pmulhrsw m4, m7
24999pslldq m2, 2
25000pinsrw m2, [r3 + 3], 0
25001pmaddubsw m5, m2, m6
25002pmulhrsw m5, m7
25003packuswb m4, m5
25004movu [r0 + 1166 * 16], m4
25005pslldq m1, 2
25006pinsrw m1, [r3 + 11], 0
25007pmaddubsw m4, m1, m6
25008pmulhrsw m4, m7
25009pslldq m3, 2
25010pinsrw m3, [r3 + 19], 0
25011pmaddubsw m5, m3, m6
25012pmulhrsw m5, m7
25013packuswb m4, m5
25014movu [r0 + 1167 * 16], m4
25015
25016; mode 20 [row 8]
25017movu m6, [r5 + 3 * 16]
25018pmaddubsw m4, m0, m6
25019pmulhrsw m4, m7
25020pmaddubsw m5, m2, m6
25021pmulhrsw m5, m7
25022packuswb m4, m5
25023movu [r0 + 1168 * 16], m4
25024pmaddubsw m4, m1, m6
25025pmulhrsw m4, m7
25026pmaddubsw m5, m3, m6
25027pmulhrsw m5, m7
25028packuswb m4, m5
25029movu [r0 + 1169 * 16], m4
25030
25031; mode 20 [row 9]
; Shift-and-insert row using weight table entry 14: each pair register is
; shifted up by one pair and receives a new pair in bytes 0/1.
25032movu m6, [r5 + 14 * 16]
25033pslldq m0, 2
25034pinsrb m0, [r4 + 8], 1
25035pinsrb m0, [r4 + 9], 0
25036pmaddubsw m4, m0, m6
25037pmulhrsw m4, m7
25038pslldq m2, 2
; NOTE(review): the next two byte inserts place r3[2] in byte 0 and r3[3] in
; byte 1, which is exactly what a single `pinsrw m2, [r3 + 2], 0` would load
; (little-endian word), the form the neighbouring rows use for m2.
25039pinsrb m2, [r3 + 3], 1
25040pinsrb m2, [r3 + 2], 0
25041pmaddubsw m5, m2, m6
25042pmulhrsw m5, m7
25043packuswb m4, m5
25044movu [r0 + 1170 * 16], m4
25045pslldq m1, 2
25046pinsrw m1, [r3 + 10], 0
25047pmaddubsw m4, m1, m6
25048pmulhrsw m4, m7
25049pslldq m3, 2
25050pinsrw m3, [r3 + 18], 0
25051pmaddubsw m5, m3, m6
25052pmulhrsw m5, m7
25053packuswb m4, m5
25054movu [r0 + 1171 * 16], m4
25055
25056; mode 20 [row 10]
25057movu m6, [r5 + 25 * 16]
25058pslldq m0, 2
25059pinsrb m0, [r4 + 9], 1
25060pinsrb m0, [r4 + 11], 0
25061pmaddubsw m4, m0, m6
25062pmulhrsw m4, m7
25063pslldq m2, 2
25064pinsrw m2, [r3 + 1], 0
25065pmaddubsw m5, m2, m6
25066pmulhrsw m5, m7
25067packuswb m4, m5
25068movu [r0 + 1172 * 16], m4
25069pslldq m1, 2
25070pinsrw m1, [r3 + 9], 0
25071pmaddubsw m4, m1, m6
25072pmulhrsw m4, m7
25073pslldq m3, 2
25074pinsrw m3, [r3 + 17], 0
25075pmaddubsw m5, m3, m6
25076pmulhrsw m5, m7
25077packuswb m4, m5
25078movu [r0 + 1173 * 16], m4
25079
25080; mode 20 [row 11]
25081movu m6, [r5 + 4 * 16]
25082pmaddubsw m4, m0, m6
25083pmulhrsw m4, m7
25084pmaddubsw m5, m2, m6
25085pmulhrsw m5, m7
25086packuswb m4, m5
25087movu [r0 + 1174 * 16], m4
25088pmaddubsw m4, m1, m6
25089pmulhrsw m4, m7
25090pmaddubsw m5, m3, m6
25091pmulhrsw m5, m7
25092packuswb m4, m5
25093movu [r0 + 1175 * 16], m4
25094
25095; mode 20 [row 12]
; Shift-and-insert row using weight table entry 15.
; This is the last row of this mode in which m2 is refilled from r3; the next
; shifting row switches m2 to the r4 array.
25096movu m6, [r5 + 15 * 16]
25097pslldq m0, 2
25098pinsrb m0, [r4 + 11], 1
25099pinsrb m0, [r4 + 12], 0
25100pmaddubsw m4, m0, m6
25101pmulhrsw m4, m7
25102pslldq m2, 2
; NOTE(review): byte 0 = r3[0], byte 1 = r3[1]; identical to what a single
; `pinsrw m2, [r3 + 0], 0` would load (little-endian word).
25103pinsrb m2, [r3 + 1], 1
25104pinsrb m2, [r3 + 0], 0
25105pmaddubsw m5, m2, m6
25106pmulhrsw m5, m7
25107packuswb m4, m5
25108movu [r0 + 1176 * 16], m4
25109pslldq m1, 2
25110pinsrw m1, [r3 + 8], 0
25111pmaddubsw m4, m1, m6
25112pmulhrsw m4, m7
25113pslldq m3, 2
25114pinsrw m3, [r3 + 16], 0
25115pmaddubsw m5, m3, m6
25116pmulhrsw m5, m7
25117packuswb m4, m5
25118movu [r0 + 1177 * 16], m4
25119
25120; mode 20 [row 13]
25121movu m6, [r5 + 26 * 16]
25122pslldq m0, 2
25123pinsrb m0, [r4 + 12], 1
25124pinsrb m0, [r4 + 14], 0
25125pmaddubsw m4, m0, m6
25126pmulhrsw m4, m7
25127pslldq m2, 2
25128pinsrb m2, [r4 + 0], 1
25129pinsrb m2, [r4 + 2], 0
25130pmaddubsw m5, m2, m6
25131pmulhrsw m5, m7
25132packuswb m4, m5
25133movu [r0 + 1178 * 16], m4
25134pslldq m1, 2
25135pinsrw m1, [r3 + 7], 0
25136pmaddubsw m4, m1, m6
25137pmulhrsw m4, m7
25138pslldq m3, 2
25139pinsrw m3, [r3 + 15], 0
25140pmaddubsw m5, m3, m6
25141pmulhrsw m5, m7
25142packuswb m4, m5
25143movu [r0 + 1179 * 16], m4
25144
25145; mode 20 [row 14]
25146movu m6, [r5 + 5 * 16]
25147pmaddubsw m4, m0, m6
25148pmulhrsw m4, m7
25149pmaddubsw m5, m2, m6
25150pmulhrsw m5, m7
25151packuswb m4, m5
25152movu [r0 + 1180 * 16], m4
25153pmaddubsw m4, m1, m6
25154pmulhrsw m4, m7
25155pmaddubsw m5, m3, m6
25156pmulhrsw m5, m7
25157packuswb m4, m5
25158movu [r0 + 1181 * 16], m4
25159
25160; mode 20 [row 15]
25161movu m6, [r5 + 16 * 16]
25162pslldq m0, 2
25163pinsrb m0, [r4 + 14], 1
25164pinsrb m0, [r4 + 15], 0
25165pmaddubsw m4, m0, m6
25166pmulhrsw m4, m7
25167pslldq m2, 2
25168pinsrb m2, [r4 + 2], 1
25169pinsrb m2, [r4 + 3], 0
25170pmaddubsw m5, m2, m6
25171pmulhrsw m5, m7
25172packuswb m4, m5
25173movu [r0 + 1182 * 16], m4
25174pslldq m1, 2
25175pinsrw m1, [r3 + 6], 0
25176pmaddubsw m4, m1, m6
25177pmulhrsw m4, m7
25178pslldq m3, 2
25179pinsrw m3, [r3 + 14], 0
25180pmaddubsw m5, m3, m6
25181pmulhrsw m5, m7
25182packuswb m4, m5
25183movu [r0 + 1183 * 16], m4
25184
25185; mode 20 [row 16]
25186movu m6, [r5 + 27 * 16]
25187pslldq m0, 2
25188pinsrb m0, [r4 + 15], 1
25189pinsrb m0, [r4 + 17], 0
25190pmaddubsw m4, m0, m6
25191pmulhrsw m4, m7
25192pslldq m2, 2
25193pinsrb m2, [r4 + 3], 1
25194pinsrb m2, [r4 + 5], 0
25195pmaddubsw m5, m2, m6
25196pmulhrsw m5, m7
25197packuswb m4, m5
25198movu [r0 + 1184 * 16], m4
25199pslldq m1, 2
25200pinsrw m1, [r3 + 5], 0
25201pmaddubsw m4, m1, m6
25202pmulhrsw m4, m7
25203pslldq m3, 2
25204pinsrw m3, [r3 + 13], 0
25205pmaddubsw m5, m3, m6
25206pmulhrsw m5, m7
25207packuswb m4, m5
25208movu [r0 + 1185 * 16], m4
25209
25210; mode 20 [row 17]
25211movu m6, [r5 + 6 * 16]
25212pmaddubsw m4, m0, m6
25213pmulhrsw m4, m7
25214pmaddubsw m5, m2, m6
25215pmulhrsw m5, m7
25216packuswb m4, m5
25217movu [r0 + 1186 * 16], m4
25218pmaddubsw m4, m1, m6
25219pmulhrsw m4, m7
25220pmaddubsw m5, m3, m6
25221pmulhrsw m5, m7
25222packuswb m4, m5
25223movu [r0 + 1187 * 16], m4
25224
25225; mode 20 [row 18]
25226movu m6, [r5 + 17 * 16]
25227pslldq m0, 2
25228pinsrb m0, [r4 + 17], 1
25229pinsrb m0, [r4 + 18], 0
25230pmaddubsw m4, m0, m6
25231pmulhrsw m4, m7
25232pslldq m2, 2
25233pinsrb m2, [r4 + 5], 1
25234pinsrb m2, [r4 + 6], 0
25235pmaddubsw m5, m2, m6
25236pmulhrsw m5, m7
25237packuswb m4, m5
25238movu [r0 + 1188 * 16], m4
25239pslldq m1, 2
25240pinsrw m1, [r3 + 4], 0
25241pmaddubsw m4, m1, m6
25242pmulhrsw m4, m7
25243pslldq m3, 2
25244pinsrw m3, [r3 + 12], 0
25245pmaddubsw m5, m3, m6
25246pmulhrsw m5, m7
25247packuswb m4, m5
25248movu [r0 + 1189 * 16], m4
25249
25250; mode 20 [row 19]
25251movu m6, [r5 + 28 * 16]
25252pslldq m0, 2
25253pinsrb m0, [r4 + 18], 1
25254pinsrb m0, [r4 + 20], 0
25255pmaddubsw m4, m0, m6
25256pmulhrsw m4, m7
25257pslldq m2, 2
25258pinsrb m2, [r4 + 6], 1
25259pinsrb m2, [r4 + 8], 0
25260pmaddubsw m5, m2, m6
25261pmulhrsw m5, m7
25262packuswb m4, m5
25263movu [r0 + 1190 * 16], m4
25264pslldq m1, 2
25265pinsrw m1, [r3 + 3], 0
25266pmaddubsw m4, m1, m6
25267pmulhrsw m4, m7
25268pslldq m3, 2
25269pinsrw m3, [r3 + 11], 0
25270pmaddubsw m5, m3, m6
25271pmulhrsw m5, m7
25272packuswb m4, m5
25273movu [r0 + 1191 * 16], m4
25274
25275; mode 20 [row 20]
25276movu m6, [r5 + 7 * 16]
25277pmaddubsw m4, m0, m6
25278pmulhrsw m4, m7
25279pmaddubsw m5, m2, m6
25280pmulhrsw m5, m7
25281packuswb m4, m5
25282movu [r0 + 1192 * 16], m4
25283pmaddubsw m4, m1, m6
25284pmulhrsw m4, m7
25285pmaddubsw m5, m3, m6
25286pmulhrsw m5, m7
25287packuswb m4, m5
25288movu [r0 + 1193 * 16], m4
25289
25290; mode 20 [row 21]
25291movu m6, [r5 + 18 * 16]
25292pslldq m0, 2
25293pinsrb m0, [r4 + 20], 1
25294pinsrb m0, [r4 + 21], 0
25295pmaddubsw m4, m0, m6
25296pmulhrsw m4, m7
25297pslldq m2, 2
25298pinsrb m2, [r4 + 8], 1
25299pinsrb m2, [r4 + 9], 0
25300pmaddubsw m5, m2, m6
25301pmulhrsw m5, m7
25302packuswb m4, m5
25303movu [r0 + 1194 * 16], m4
25304pslldq m1, 2
25305pinsrw m1, [r3 + 2], 0
25306pmaddubsw m4, m1, m6
25307pmulhrsw m4, m7
25308pslldq m3, 2
25309pinsrw m3, [r3 + 10], 0
25310pmaddubsw m5, m3, m6
25311pmulhrsw m5, m7
25312packuswb m4, m5
25313movu [r0 + 1195 * 16], m4
25314
25315; mode 20 [row 22]
25316movu m6, [r5 + 29 * 16]
25317pslldq m0, 2
25318pinsrb m0, [r4 + 21], 1
25319pinsrb m0, [r4 + 23], 0
25320pmaddubsw m4, m0, m6
25321pmulhrsw m4, m7
25322pslldq m2, 2
25323pinsrb m2, [r4 + 9], 1
25324pinsrb m2, [r4 + 11], 0
25325pmaddubsw m5, m2, m6
25326pmulhrsw m5, m7
25327packuswb m4, m5
25328movu [r0 + 1196 * 16], m4
25329pslldq m1, 2
25330pinsrw m1, [r3 + 1], 0
25331pmaddubsw m4, m1, m6
25332pmulhrsw m4, m7
25333pslldq m3, 2
25334pinsrw m3, [r3 + 9], 0
25335pmaddubsw m5, m3, m6
25336pmulhrsw m5, m7
25337packuswb m4, m5
25338movu [r0 + 1197 * 16], m4
25339
25340; mode 20 [row 23]
25341movu m6, [r5 + 8 * 16]
25342pmaddubsw m4, m0, m6
25343pmulhrsw m4, m7
25344pmaddubsw m5, m2, m6
25345pmulhrsw m5, m7
25346packuswb m4, m5
25347movu [r0 + 1198 * 16], m4
25348pmaddubsw m4, m1, m6
25349pmulhrsw m4, m7
25350pmaddubsw m5, m3, m6
25351pmulhrsw m5, m7
25352packuswb m4, m5
25353movu [r0 + 1199 * 16], m4
25354
25355; mode 20 [row 24]
25356movu m6, [r5 + 19 * 16]
25357pslldq m0, 2
25358pinsrb m0, [r4 + 23], 1
25359pinsrb m0, [r4 + 24], 0
25360pmaddubsw m4, m0, m6
25361pmulhrsw m4, m7
25362pslldq m2, 2
25363pinsrb m2, [r4 + 11], 1
25364pinsrb m2, [r4 + 12], 0
25365pmaddubsw m5, m2, m6
25366pmulhrsw m5, m7
25367packuswb m4, m5
25368movu [r0 + 1200 * 16], m4
25369pslldq m1, 2
25370pinsrw m1, [r3 + 0], 0
25371pmaddubsw m4, m1, m6
25372pmulhrsw m4, m7
25373pslldq m3, 2
25374pinsrw m3, [r3 + 8], 0
25375pmaddubsw m5, m3, m6
25376pmulhrsw m5, m7
25377packuswb m4, m5
25378movu [r0 + 1201 * 16], m4
25379
25380; mode 20 [row 25]
25381movu m6, [r5 + 30 * 16]
25382pslldq m0, 2
25383pinsrb m0, [r4 + 24], 1
25384pinsrb m0, [r4 + 26], 0
25385pmaddubsw m4, m0, m6
25386pmulhrsw m4, m7
25387pslldq m2, 2
25388pinsrb m2, [r4 + 12], 1
25389pinsrb m2, [r4 + 14], 0
25390pmaddubsw m5, m2, m6
25391pmulhrsw m5, m7
25392packuswb m4, m5
25393movu [r0 + 1202 * 16], m4
25394pslldq m1, 2
25395pinsrb m1, [r4 + 0], 1
25396pinsrb m1, [r4 + 2], 0
25397pmaddubsw m4, m1, m6
25398pmulhrsw m4, m7
25399pslldq m3, 2
25400pinsrw m3, [r3 + 7], 0
25401pmaddubsw m5, m3, m6
25402pmulhrsw m5, m7
25403packuswb m4, m5
25404movu [r0 + 1203 * 16], m4
25405
25406; mode 20 [row 26]
25407movu m6, [r5 + 9 * 16]
25408pmaddubsw m4, m0, m6
25409pmulhrsw m4, m7
25410pmaddubsw m5, m2, m6
25411pmulhrsw m5, m7
25412packuswb m4, m5
25413movu [r0 + 1204 * 16], m4
25414pmaddubsw m4, m1, m6
25415pmulhrsw m4, m7
25416pmaddubsw m5, m3, m6
25417pmulhrsw m5, m7
25418packuswb m4, m5
25419movu [r0 + 1205 * 16], m4
25420
25421; mode 20 [row 27]
25422movu m6, [r5 + 20 * 16]
25423pslldq m0, 2
25424pinsrb m0, [r4 + 26], 1
25425pinsrb m0, [r4 + 27], 0
25426pmaddubsw m4, m0, m6
25427pmulhrsw m4, m7
25428pslldq m2, 2
25429pinsrb m2, [r4 + 14], 1
25430pinsrb m2, [r4 + 15], 0
25431pmaddubsw m5, m2, m6
25432pmulhrsw m5, m7
25433packuswb m4, m5
25434movu [r0 + 1206 * 16], m4
25435pslldq m1, 2
25436pinsrb m1, [r4 + 2], 1
25437pinsrb m1, [r4 + 3], 0
25438pmaddubsw m4, m1, m6
25439pmulhrsw m4, m7
25440pslldq m3, 2
25441pinsrw m3, [r3 + 6], 0
25442pmaddubsw m5, m3, m6
25443pmulhrsw m5, m7
25444packuswb m4, m5
25445movu [r0 + 1207 * 16], m4
25446
25447; mode 20 [row 28]
25448movu m6, [r5 + 31 * 16]
25449pslldq m0, 2
25450pinsrb m0, [r4 + 27], 1
25451pinsrb m0, [r4 + 29], 0
25452pmaddubsw m4, m0, m6
25453pmulhrsw m4, m7
25454pslldq m2, 2
25455pinsrb m2, [r4 + 15], 1
25456pinsrb m2, [r4 + 17], 0
25457pmaddubsw m5, m2, m6
25458pmulhrsw m5, m7
25459packuswb m4, m5
25460movu [r0 + 1208 * 16], m4
25461pslldq m1, 2
25462pinsrb m1, [r4 + 3], 1
25463pinsrb m1, [r4 + 5], 0
25464pmaddubsw m4, m1, m6
25465pmulhrsw m4, m7
25466pslldq m3, 2
25467pinsrw m3, [r3 + 5], 0
25468pmaddubsw m5, m3, m6
25469pmulhrsw m5, m7
25470packuswb m4, m5
25471movu [r0 + 1209 * 16], m4
25472
25473; mode 20 [row 29]
25474movu m6, [r5 + 10 * 16]
25475pmaddubsw m4, m0, m6
25476pmulhrsw m4, m7
25477pmaddubsw m5, m2, m6
25478pmulhrsw m5, m7
25479packuswb m4, m5
25480movu [r0 + 1210 * 16], m4
25481pmaddubsw m4, m1, m6
25482pmulhrsw m4, m7
25483pmaddubsw m5, m3, m6
25484pmulhrsw m5, m7
25485packuswb m4, m5
25486movu [r0 + 1211 * 16], m4
25487
25488; mode 20 [row 30]
25489movu m6, [r5 + 21 * 16]
25490pslldq m0, 2
25491pinsrb m0, [r4 + 29], 1
25492pinsrb m0, [r4 + 30], 0
25493pmaddubsw m4, m0, m6
25494pmulhrsw m4, m7
25495pslldq m2, 2
25496pinsrb m2, [r4 + 17], 1
25497pinsrb m2, [r4 + 18], 0
25498pmaddubsw m5, m2, m6
25499pmulhrsw m5, m7
25500packuswb m4, m5
25501movu [r0 + 1212 * 16], m4
25502pslldq m1, 2
25503pinsrb m1, [r4 + 5], 1
25504pinsrb m1, [r4 + 6], 0
25505pmaddubsw m4, m1, m6
25506pmulhrsw m4, m7
25507pslldq m3, 2
25508pinsrw m3, [r3 + 4], 0
25509pmaddubsw m5, m3, m6
25510pmulhrsw m5, m7
25511packuswb m4, m5
25512movu [r0 + 1213 * 16], m4
25513
25514; mode 20 [row 31]
; Shuffle-only row: no weight vector is loaded and no pmaddubsw/pmulhrsw is
; applied here. Each of the four pair registers is passed through the tab_S2
; byte shuffle and the low 8 bytes of the result are stored.
; Store order m0, m2, m1, m3 follows the other rows of this mode: m0/m2 fill
; the first 16-byte output slot, m1/m3 the second.
; NOTE(review): tab_S2 is defined outside this chunk; presumably it gathers
; one byte from every pair (a straight copy for a zero fraction) -- confirm.
25515pshufb m5, m0, [tab_S2]
25516movh [r0 + 1214 * 16], m5
25517pshufb m5, m2, [tab_S2]
25518movh [r0 + 1214 * 16 + 8], m5
25519pshufb m5, m1, [tab_S2]
25520movh [r0 + 1215 * 16], m5
25521pshufb m5, m3, [tab_S2]
25522movh [r0 + 1215 * 16 + 8], m5
25523
25524; mode 21 [row 0]
; Rebuilds the four "pair" registers from scratch for this mode: each holds 8
; interleaved byte pairs (r3[i], r3[i + 1]).
;   m0 <- i = 0..7, m2 <- i = 8..15, m1 <- i = 16..23, m3 <- i = 24..31
; m6 = 16-byte weight vector, entry 15 of the table at r5.
; NOTE(review): r5 (weight table) and m7 (pmulhrsw multiplier) are set up
; outside this chunk; presumably (32 - f, f) weights and a rounding >> 5 -- confirm.
25525movu m6, [r5 + 15 * 16]
25526movu m0, [r3 ]
25527movu m1, [r3 + 1 ]
25528punpcklbw m0, m1 ; m0 = pairs for output bytes 0..7
25529pmaddubsw m1, m0, m6 ; x86inc 3-operand form: result goes to m1, m0 is kept
25530pmulhrsw m1, m7
25531movu m2, [r3 + 8]
25532movu m3, [r3 + 9]
25533punpcklbw m2, m3 ; m2 = pairs for output bytes 8..15
25534pmaddubsw m3, m2, m6
25535pmulhrsw m3, m7
25536packuswb m1, m3 ; saturate both halves to unsigned bytes
25537movu [r0 + 1216 * 16], m1 ; first 16 bytes of this row
25538
25539movu m1, [r3 + 16]
25540movu m3, [r3 + 17]
25541punpcklbw m1, m3 ; m1 = pairs for output bytes 16..23
25542pmaddubsw m4, m1, m6
25543pmulhrsw m4, m7
25544movu m3, [r3 + 24]
25545movu m5, [r3 + 25]
25546punpcklbw m3, m5 ; m3 = pairs for output bytes 24..31
25547pmaddubsw m5, m3, m6
25548pmulhrsw m5, m7
25549packuswb m4, m5
25550movu [r0 + 1217 * 16], m4 ; second 16 bytes of this row
25551
25552; mode 21 [row 1]
25553movu m6, [r5 + 30 * 16]
25554pslldq m0, 2
25555pinsrb m0, [r4 + 0], 1
25556pinsrb m0, [r4 + 2], 0
25557pmaddubsw m4, m0, m6
25558pmulhrsw m4, m7
25559pslldq m2, 2
25560pinsrw m2, [r3 + 7], 0
25561pmaddubsw m5, m2, m6
25562pmulhrsw m5, m7
25563packuswb m4, m5
25564movu [r0 + 1218 * 16], m4
25565pslldq m1, 2
25566pinsrw m1, [r3 + 15], 0
25567pmaddubsw m4, m1, m6
25568pmulhrsw m4, m7
25569pslldq m3, 2
25570pinsrw m3, [r3 + 23], 0
25571pmaddubsw m5, m3, m6
25572pmulhrsw m5, m7
25573packuswb m4, m5
25574movu [r0 + 1219 * 16], m4
25575
25576; mode 21 [row 2]
25577movu m6, [r5 + 13 * 16]
25578pmaddubsw m4, m0, m6
25579pmulhrsw m4, m7
25580pmaddubsw m5, m2, m6
25581pmulhrsw m5, m7
25582packuswb m4, m5
25583movu [r0 + 1220 * 16], m4
25584pmaddubsw m4, m1, m6
25585pmulhrsw m4, m7
25586pmaddubsw m5, m3, m6
25587pmulhrsw m5, m7
25588packuswb m4, m5
25589movu [r0 + 1221 * 16], m4
25590
25591; mode 21 [row 3]
25592movu m6, [r5 + 28 * 16]
25593pslldq m0, 2
25594pinsrb m0, [r4 + 2], 1
25595pinsrb m0, [r4 + 4], 0
25596pmaddubsw m4, m0, m6
25597pmulhrsw m4, m7
25598pslldq m2, 2
25599pinsrw m2, [r3 + 6], 0
25600pmaddubsw m5, m2, m6
25601pmulhrsw m5, m7
25602packuswb m4, m5
25603movu [r0 + 1222 * 16], m4
25604pslldq m1, 2
25605pinsrw m1, [r3 + 14], 0
25606pmaddubsw m4, m1, m6
25607pmulhrsw m4, m7
25608pslldq m3, 2
25609pinsrw m3, [r3 + 22], 0
25610pmaddubsw m5, m3, m6
25611pmulhrsw m5, m7
25612packuswb m4, m5
25613movu [r0 + 1223 * 16], m4
25614
25615; mode 21 [row 4]
25616movu m6, [r5 + 11 * 16]
25617pmaddubsw m4, m0, m6
25618pmulhrsw m4, m7
25619pmaddubsw m5, m2, m6
25620pmulhrsw m5, m7
25621packuswb m4, m5
25622movu [r0 + 1224 * 16], m4
25623pmaddubsw m4, m1, m6
25624pmulhrsw m4, m7
25625pmaddubsw m5, m3, m6
25626pmulhrsw m5, m7
25627packuswb m4, m5
25628movu [r0 + 1225 * 16], m4
25629
25630; mode 21 [row 5]
25631movu m6, [r5 + 26 * 16]
25632pslldq m0, 2
25633pinsrb m0, [r4 + 4], 1
25634pinsrb m0, [r4 + 6], 0
25635pmaddubsw m4, m0, m6
25636pmulhrsw m4, m7
25637pslldq m2, 2
25638pinsrw m2, [r3 + 5], 0
25639pmaddubsw m5, m2, m6
25640pmulhrsw m5, m7
25641packuswb m4, m5
25642movu [r0 + 1226 * 16], m4
25643pslldq m1, 2
25644pinsrw m1, [r3 + 13], 0
25645pmaddubsw m4, m1, m6
25646pmulhrsw m4, m7
25647pslldq m3, 2
25648pinsrw m3, [r3 + 21], 0
25649pmaddubsw m5, m3, m6
25650pmulhrsw m5, m7
25651packuswb m4, m5
25652movu [r0 + 1227 * 16], m4
25653
25654; mode 21 [row 6]
25655movu m6, [r5 + 9 * 16]
25656pmaddubsw m4, m0, m6
25657pmulhrsw m4, m7
25658pmaddubsw m5, m2, m6
25659pmulhrsw m5, m7
25660packuswb m4, m5
25661movu [r0 + 1228 * 16], m4
25662pmaddubsw m4, m1, m6
25663pmulhrsw m4, m7
25664pmaddubsw m5, m3, m6
25665pmulhrsw m5, m7
25666packuswb m4, m5
25667movu [r0 + 1229 * 16], m4
25668
25669; mode 21 [row 7]
25670movu m6, [r5 + 24 * 16]
25671pslldq m0, 2
25672pinsrb m0, [r4 + 6], 1
25673pinsrb m0, [r4 + 8], 0
25674pmaddubsw m4, m0, m6
25675pmulhrsw m4, m7
25676pslldq m2, 2
25677pinsrw m2, [r3 + 4], 0
25678pmaddubsw m5, m2, m6
25679pmulhrsw m5, m7
25680packuswb m4, m5
25681movu [r0 + 1230 * 16], m4
25682pslldq m1, 2
25683pinsrw m1, [r3 + 12], 0
25684pmaddubsw m4, m1, m6
25685pmulhrsw m4, m7
25686pslldq m3, 2
25687pinsrw m3, [r3 + 20], 0
25688pmaddubsw m5, m3, m6
25689pmulhrsw m5, m7
25690packuswb m4, m5
25691movu [r0 + 1231 * 16], m4
25692
25693; mode 21 [row 8]
25694movu m6, [r5 + 7 * 16]
25695pmaddubsw m4, m0, m6
25696pmulhrsw m4, m7
25697pmaddubsw m5, m2, m6
25698pmulhrsw m5, m7
25699packuswb m4, m5
25700movu [r0 + 1232 * 16], m4
25701pmaddubsw m4, m1, m6
25702pmulhrsw m4, m7
25703pmaddubsw m5, m3, m6
25704pmulhrsw m5, m7
25705packuswb m4, m5
25706movu [r0 + 1233 * 16], m4
25707
25708; mode 21 [row 9]
25709movu m6, [r5 + 22 * 16]
25710pslldq m0, 2
25711pinsrb m0, [r4 + 8], 1
25712pinsrb m0, [r4 + 9], 0
25713pmaddubsw m4, m0, m6
25714pmulhrsw m4, m7
25715pslldq m2, 2
25716pinsrw m2, [r3 + 3], 0
25717pmaddubsw m5, m2, m6
25718pmulhrsw m5, m7
25719packuswb m4, m5
25720movu [r0 + 1234 * 16], m4
25721pslldq m1, 2
25722pinsrw m1, [r3 + 11], 0
25723pmaddubsw m4, m1, m6
25724pmulhrsw m4, m7
25725pslldq m3, 2
25726pinsrw m3, [r3 + 19], 0
25727pmaddubsw m5, m3, m6
25728pmulhrsw m5, m7
25729packuswb m4, m5
25730movu [r0 + 1235 * 16], m4
25731
25732; mode 21 [row 10]
25733movu m6, [r5 + 5 * 16]
25734pmaddubsw m4, m0, m6
25735pmulhrsw m4, m7
25736pmaddubsw m5, m2, m6
25737pmulhrsw m5, m7
25738packuswb m4, m5
25739movu [r0 + 1236 * 16], m4
25740pmaddubsw m4, m1, m6
25741pmulhrsw m4, m7
25742pmaddubsw m5, m3, m6
25743pmulhrsw m5, m7
25744packuswb m4, m5
25745movu [r0 + 1237 * 16], m4
25746
25747; mode 21 [row 11]
25748movu m6, [r5 + 20 * 16]
25749pslldq m0, 2
25750pinsrb m0, [r4 + 9], 1
25751pinsrb m0, [r4 + 11], 0
25752pmaddubsw m4, m0, m6
25753pmulhrsw m4, m7
25754pslldq m2, 2
25755pinsrw m2, [r3 + 2], 0
25756pmaddubsw m5, m2, m6
25757pmulhrsw m5, m7
25758packuswb m4, m5
25759movu [r0 + 1238 * 16], m4
25760pslldq m1, 2
25761pinsrw m1, [r3 + 10], 0
25762pmaddubsw m4, m1, m6
25763pmulhrsw m4, m7
25764pslldq m3, 2
25765pinsrw m3, [r3 + 18], 0
25766pmaddubsw m5, m3, m6
25767pmulhrsw m5, m7
25768packuswb m4, m5
25769movu [r0 + 1239 * 16], m4
25770
25771; mode 21 [row 12]
25772movu m6, [r5 + 3 * 16]
25773pmaddubsw m4, m0, m6
25774pmulhrsw m4, m7
25775pmaddubsw m5, m2, m6
25776pmulhrsw m5, m7
25777packuswb m4, m5
25778movu [r0 + 1240 * 16], m4
25779pmaddubsw m4, m1, m6
25780pmulhrsw m4, m7
25781pmaddubsw m5, m3, m6
25782pmulhrsw m5, m7
25783packuswb m4, m5
25784movu [r0 + 1241 * 16], m4
25785
25786; mode 21 [row 13]
25787movu m6, [r5 + 18 * 16]
25788pslldq m0, 2
25789pinsrb m0, [r4 + 11], 1
25790pinsrb m0, [r4 + 13], 0
25791pmaddubsw m4, m0, m6
25792pmulhrsw m4, m7
25793pslldq m2, 2
25794pinsrw m2, [r3 + 1], 0
25795pmaddubsw m5, m2, m6
25796pmulhrsw m5, m7
25797packuswb m4, m5
25798movu [r0 + 1242 * 16], m4
25799pslldq m1, 2
25800pinsrw m1, [r3 + 9], 0
25801pmaddubsw m4, m1, m6
25802pmulhrsw m4, m7
25803pslldq m3, 2
25804pinsrw m3, [r3 + 17], 0
25805pmaddubsw m5, m3, m6
25806pmulhrsw m5, m7
25807packuswb m4, m5
25808movu [r0 + 1243 * 16], m4
25809
25810; mode 21 [row 14]
25811movu m6, [r5 + 1 * 16]
25812pmaddubsw m4, m0, m6
25813pmulhrsw m4, m7
25814pmaddubsw m5, m2, m6
25815pmulhrsw m5, m7
25816packuswb m4, m5
25817movu [r0 + 1244 * 16], m4
25818pmaddubsw m4, m1, m6
25819pmulhrsw m4, m7
25820pmaddubsw m5, m3, m6
25821pmulhrsw m5, m7
25822packuswb m4, m5
25823movu [r0 + 1245 * 16], m4
25824
25825; mode 21 [row 15]
25826movu m6, [r5 + 16 * 16]
25827pslldq m0, 2
25828pinsrb m0, [r4 + 13], 1
25829pinsrb m0, [r4 + 15], 0
25830pmaddubsw m4, m0, m6
25831pmulhrsw m4, m7
25832pslldq m2, 2
25833pinsrw m2, [r3 + 0], 0
25834pmaddubsw m5, m2, m6
25835pmulhrsw m5, m7
25836packuswb m4, m5
25837movu [r0 + 1246 * 16], m4
25838pslldq m1, 2
25839pinsrw m1, [r3 + 8], 0
25840pmaddubsw m4, m1, m6
25841pmulhrsw m4, m7
25842pslldq m3, 2
25843pinsrw m3, [r3 + 16], 0
25844pmaddubsw m5, m3, m6
25845pmulhrsw m5, m7
25846packuswb m4, m5
25847movu [r0 + 1247 * 16], m4
25848
25849; mode 21 [row 16]
25850movu m6, [r5 + 31 * 16]
25851pslldq m0, 2
25852pinsrb m0, [r4 + 15], 1
25853pinsrb m0, [r4 + 17], 0
25854pmaddubsw m4, m0, m6
25855pmulhrsw m4, m7
25856pslldq m2, 2
25857pinsrb m2, [r4 + 0], 1
25858pinsrb m2, [r4 + 2], 0
25859pmaddubsw m5, m2, m6
25860pmulhrsw m5, m7
25861packuswb m4, m5
25862movu [r0 + 1248 * 16], m4
25863pslldq m1, 2
25864pinsrw m1, [r3 + 7], 0
25865pmaddubsw m4, m1, m6
25866pmulhrsw m4, m7
25867pslldq m3, 2
25868pinsrw m3, [r3 + 15], 0
25869pmaddubsw m5, m3, m6
25870pmulhrsw m5, m7
25871packuswb m4, m5
25872movu [r0 + 1249 * 16], m4
25873
25874; mode 21 [row 17]
25875movu m6, [r5 + 14 * 16]
25876pmaddubsw m4, m0, m6
25877pmulhrsw m4, m7
25878pmaddubsw m5, m2, m6
25879pmulhrsw m5, m7
25880packuswb m4, m5
25881movu [r0 + 1250 * 16], m4
25882pmaddubsw m4, m1, m6
25883pmulhrsw m4, m7
25884pmaddubsw m5, m3, m6
25885pmulhrsw m5, m7
25886packuswb m4, m5
25887movu [r0 + 1251 * 16], m4
25888
25889; mode 21 [row 18]
; Shift-and-insert row using weight table entry 29: each pair register is
; shifted up by one pair and receives a new pair in bytes 0/1.
; m0 and m2 are refilled from the r4 array, m1 and m3 from r3.
25890movu m6, [r5 + 29 * 16]
25891pslldq m0, 2
25892pinsrb m0, [r4 + 17], 1
25893pinsrb m0, [r4 + 19], 0
25894pmaddubsw m4, m0, m6
25895pmulhrsw m4, m7
25896pslldq m2, 2
25897pinsrb m2, [r4 + 2], 1
25898pinsrb m2, [r4 + 4], 0
25899pmaddubsw m5, m2, m6
25900pmulhrsw m5, m7
25901packuswb m4, m5
25902movu [r0 + 1252 * 16], m4
25903pslldq m1, 2
; NOTE(review): byte 0 = r3[6], byte 1 = r3[7]; identical to what a single
; `pinsrw m1, [r3 + 6], 0` would load, the form the other rows of this mode
; use when refilling from r3.
25904pinsrb m1, [r3 + 7], 1
25905pinsrb m1, [r3 + 6], 0
25906pmaddubsw m4, m1, m6
25907pmulhrsw m4, m7
25908pslldq m3, 2
; NOTE(review): likewise equivalent to `pinsrw m3, [r3 + 14], 0`.
25909pinsrb m3, [r3 + 15], 1
25910pinsrb m3, [r3 + 14], 0
25911pmaddubsw m5, m3, m6
25912pmulhrsw m5, m7
25913packuswb m4, m5
25914movu [r0 + 1253 * 16], m4
25915
25916; mode 21 [row 19]
25917movu m6, [r5 + 12 * 16]
25918pmaddubsw m4, m0, m6
25919pmulhrsw m4, m7
25920pmaddubsw m5, m2, m6
25921pmulhrsw m5, m7
25922packuswb m4, m5
25923movu [r0 + 1254 * 16], m4
25924pmaddubsw m4, m1, m6
25925pmulhrsw m4, m7
25926pmaddubsw m5, m3, m6
25927pmulhrsw m5, m7
25928packuswb m4, m5
25929movu [r0 + 1255 * 16], m4
25930
25931; mode 21 [row 20]
25932movu m6, [r5 + 27 * 16]
25933pslldq m0, 2
25934pinsrb m0, [r4 + 19], 1
25935pinsrb m0, [r4 + 21], 0
25936pmaddubsw m4, m0, m6
25937pmulhrsw m4, m7
25938pslldq m2, 2
25939pinsrb m2, [r4 + 4], 1
25940pinsrb m2, [r4 + 6], 0
25941pmaddubsw m5, m2, m6
25942pmulhrsw m5, m7
25943packuswb m4, m5
25944movu [r0 + 1256 * 16], m4
25945pslldq m1, 2
25946pinsrw m1, [r3 + 5], 0
25947pmaddubsw m4, m1, m6
25948pmulhrsw m4, m7
25949pslldq m3, 2
25950pinsrw m3, [r3 + 13], 0
25951pmaddubsw m5, m3, m6
25952pmulhrsw m5, m7
25953packuswb m4, m5
25954movu [r0 + 1257 * 16], m4
25955
25956; mode 21 [row 21]
25957movu m6, [r5 + 10 * 16]
25958pmaddubsw m4, m0, m6
25959pmulhrsw m4, m7
25960pmaddubsw m5, m2, m6
25961pmulhrsw m5, m7
25962packuswb m4, m5
25963movu [r0 + 1258 * 16], m4
25964pmaddubsw m4, m1, m6
25965pmulhrsw m4, m7
25966pmaddubsw m5, m3, m6
25967pmulhrsw m5, m7
25968packuswb m4, m5
25969movu [r0 + 1259 * 16], m4
25970
25971; mode 21 [row 22]
25972movu m6, [r5 + 25 * 16]
25973pslldq m0, 2
25974pinsrb m0, [r4 + 21], 1
25975pinsrb m0, [r4 + 23], 0
25976pmaddubsw m4, m0, m6
25977pmulhrsw m4, m7
25978pslldq m2, 2
25979pinsrb m2, [r4 + 6], 1
25980pinsrb m2, [r4 + 8], 0
25981pmaddubsw m5, m2, m6
25982pmulhrsw m5, m7
25983packuswb m4, m5
25984movu [r0 + 1260 * 16], m4
25985pslldq m1, 2
25986pinsrw m1, [r3 + 4], 0
25987pmaddubsw m4, m1, m6
25988pmulhrsw m4, m7
25989pslldq m3, 2
25990pinsrw m3, [r3 + 12], 0
25991pmaddubsw m5, m3, m6
25992pmulhrsw m5, m7
25993packuswb m4, m5
25994movu [r0 + 1261 * 16], m4
25995
25996; mode 21 [row 23]
25997movu m6, [r5 + 8 * 16]
25998pmaddubsw m4, m0, m6
25999pmulhrsw m4, m7
26000pmaddubsw m5, m2, m6
26001pmulhrsw m5, m7
26002packuswb m4, m5
26003movu [r0 + 1262 * 16], m4
26004pmaddubsw m4, m1, m6
26005pmulhrsw m4, m7
26006pmaddubsw m5, m3, m6
26007pmulhrsw m5, m7
26008packuswb m4, m5
26009movu [r0 + 1263 * 16], m4
26010
26011; mode 21 [row 24]
26012movu m6, [r5 + 23 * 16]
26013pslldq m0, 2
26014pinsrb m0, [r4 + 23], 1
26015pinsrb m0, [r4 + 24], 0
26016pmaddubsw m4, m0, m6
26017pmulhrsw m4, m7
26018pslldq m2, 2
26019pinsrb m2, [r4 + 8], 1
26020pinsrb m2, [r4 + 9], 0
26021pmaddubsw m5, m2, m6
26022pmulhrsw m5, m7
26023packuswb m4, m5
26024movu [r0 + 1264 * 16], m4
26025pslldq m1, 2
26026pinsrw m1, [r3 + 3], 0
26027pmaddubsw m4, m1, m6
26028pmulhrsw m4, m7
26029pslldq m3, 2
26030pinsrw m3, [r3 + 11], 0
26031pmaddubsw m5, m3, m6
26032pmulhrsw m5, m7
26033packuswb m4, m5
26034movu [r0 + 1265 * 16], m4
26035
26036; mode 21 [row 25]
26037movu m6, [r5 + 6 * 16]
26038pmaddubsw m4, m0, m6
26039pmulhrsw m4, m7
26040pmaddubsw m5, m2, m6
26041pmulhrsw m5, m7
26042packuswb m4, m5
26043movu [r0 + 1266 * 16], m4
26044pmaddubsw m4, m1, m6
26045pmulhrsw m4, m7
26046pmaddubsw m5, m3, m6
26047pmulhrsw m5, m7
26048packuswb m4, m5
26049movu [r0 + 1267 * 16], m4
26050
26051; mode 21 [row 26]
26052movu m6, [r5 + 21 * 16]
26053pslldq m0, 2
26054pinsrb m0, [r4 + 24], 1
26055pinsrb m0, [r4 + 26], 0
26056pmaddubsw m4, m0, m6
26057pmulhrsw m4, m7
26058pslldq m2, 2
26059pinsrb m2, [r4 + 9], 1
26060pinsrb m2, [r4 + 11], 0
26061pmaddubsw m5, m2, m6
26062pmulhrsw m5, m7
26063packuswb m4, m5
26064movu [r0 + 1268 * 16], m4
26065pslldq m1, 2
26066pinsrw m1, [r3 + 2], 0
26067pmaddubsw m4, m1, m6
26068pmulhrsw m4, m7
26069pslldq m3, 2
26070pinsrw m3, [r3 + 10], 0
26071pmaddubsw m5, m3, m6
26072pmulhrsw m5, m7
26073packuswb m4, m5
26074movu [r0 + 1269 * 16], m4
26075
26076; mode 21 [row 27]
26077movu m6, [r5 + 4 * 16]
26078pmaddubsw m4, m0, m6
26079pmulhrsw m4, m7
26080pmaddubsw m5, m2, m6
26081pmulhrsw m5, m7
26082packuswb m4, m5
26083movu [r0 + 1270 * 16], m4
26084pmaddubsw m4, m1, m6
26085pmulhrsw m4, m7
26086pmaddubsw m5, m3, m6
26087pmulhrsw m5, m7
26088packuswb m4, m5
26089movu [r0 + 1271 * 16], m4
26090
26091; mode 21 [row 28]
26092movu m6, [r5 + 19 * 16]
26093pslldq m0, 2
26094pinsrb m0, [r4 + 26], 1
26095pinsrb m0, [r4 + 28], 0
26096pmaddubsw m4, m0, m6
26097pmulhrsw m4, m7
26098pslldq m2, 2
26099pinsrb m2, [r4 + 11], 1
26100pinsrb m2, [r4 + 13], 0
26101pmaddubsw m5, m2, m6
26102pmulhrsw m5, m7
26103packuswb m4, m5
26104movu [r0 + 1272 * 16], m4
26105pslldq m1, 2
26106pinsrw m1, [r3 + 1], 0
26107pmaddubsw m4, m1, m6
26108pmulhrsw m4, m7
26109pslldq m3, 2
26110pinsrw m3, [r3 + 9], 0
26111pmaddubsw m5, m3, m6
26112pmulhrsw m5, m7
26113packuswb m4, m5
26114movu [r0 + 1273 * 16], m4
26115
26116; mode 21 [row 29]
26117movu m6, [r5 + 2 * 16]
26118pmaddubsw m4, m0, m6
26119pmulhrsw m4, m7
26120pmaddubsw m5, m2, m6
26121pmulhrsw m5, m7
26122packuswb m4, m5
26123movu [r0 + 1274 * 16], m4
26124pmaddubsw m4, m1, m6
26125pmulhrsw m4, m7
26126pmaddubsw m5, m3, m6
26127pmulhrsw m5, m7
26128packuswb m4, m5
26129movu [r0 + 1275 * 16], m4
26130
26131; mode 21 [row 30]
26132movu m6, [r5 + 17 * 16]
26133pslldq m0, 2
26134pinsrb m0, [r4 + 28], 1
26135pinsrb m0, [r4 + 30], 0
26136pmaddubsw m4, m0, m6
26137pmulhrsw m4, m7
26138pslldq m2, 2
26139pinsrb m2, [r4 + 13], 1
26140pinsrb m2, [r4 + 15], 0
26141pmaddubsw m5, m2, m6
26142pmulhrsw m5, m7
26143packuswb m4, m5
26144movu [r0 + 1276 * 16], m4
26145pslldq m1, 2
26146pinsrw m1, [r3 + 0], 0
26147pmaddubsw m4, m1, m6
26148pmulhrsw m4, m7
26149pslldq m3, 2
26150pinsrw m3, [r3 + 8], 0
26151pmaddubsw m5, m3, m6
26152pmulhrsw m5, m7
26153packuswb m4, m5
26154movu [r0 + 1277 * 16], m4
26155
26156; mode 21 [row 31]
26157pshufb m5, m0, [tab_S2]
26158movh [r0 + 1278 * 16], m5
26159pshufb m5, m2, [tab_S2]
26160movh [r0 + 1278 * 16 + 8], m5
26161pshufb m5, m1, [tab_S2]
26162movh [r0 + 1279 * 16], m5
26163pshufb m5, m3, [tab_S2]
26164movh [r0 + 1279 * 16 + 8], m5
26165
;-----------------------------------------------------------------------
; mode 22 section: 32 rows of 32 output bytes each.
; Row n is written as two 16-byte stores at r0 + (1280 + 2 * n) * 16 and
; r0 + (1281 + 2 * n) * 16.
;
; Register roles inside this section (as used by the code below):
;   m0, m2, m1, m3 - four windows of interleaved byte pairs; row 0 builds them
;                    from [r3 + 0], [r3 + 8], [r3 + 16], [r3 + 24] and the byte
;                    that follows each. m0|m2 feed the first store of a row,
;                    m1|m3 feed the second store.
;   m6             - 16-byte coefficient row loaded from r5 + k * 16
;   m7             - pmulhrsw multiplier, loaded before this section
;                    (NOTE(review): presumably the rounding scale for a >> 5 - confirm)
;   m4, m5         - scratch for the two halves of each store
;   r4             - byte source for new low pairs of m0 (and of m2 from row 22 on)
;                    (NOTE(review): presumably the projected side reference - confirm)
;
; Per row: pmaddubsw multiplies the unsigned bytes of a window by the signed
; bytes of m6 and sums each pair into a word, pmulhrsw scales by m7, and
; packuswb saturates two word vectors into 16 bytes.
;
; The coefficient row index k steps by -13 (mod 32) per row: 19, 6, 25, 12, ...
; On every row where k increases relative to the previous row, each window is
; shifted left by one byte pair (pslldq 2) and a new pair is inserted at the
; bottom before the multiply; on the other rows the windows are reused as-is.
;-----------------------------------------------------------------------
26166; mode 22 [row 0] - build the four pair windows from r3 and emit the first row
26167movu m6, [r5 + 19 * 16]
26168movu m0, [r3 ]
26169movu m1, [r3 + 1 ]
26170punpcklbw m0, m1
26171pmaddubsw m1, m0, m6
26172pmulhrsw m1, m7
26173movu m2, [r3 + 8]
26174movu m3, [r3 + 9]
26175punpcklbw m2, m3
26176pmaddubsw m3, m2, m6
26177pmulhrsw m3, m7
26178packuswb m1, m3
26179movu [r0 + 1280 * 16], m1
26180
; second 16 output bytes of row 0; m1 and m3 are rebuilt here as the
; windows starting at r3 + 16 and r3 + 24
26181movu m1, [r3 + 16]
26182movu m3, [r3 + 17]
26183punpcklbw m1, m3
26184pmaddubsw m4, m1, m6
26185pmulhrsw m4, m7
26186movu m3, [r3 + 24]
26187movu m5, [r3 + 25]
26188punpcklbw m3, m5
26189pmaddubsw m5, m3, m6
26190pmulhrsw m5, m7
26191packuswb m4, m5
26192movu [r0 + 1281 * 16], m4
26193
26194; mode 22 [row 1]
26195movu m6, [r5 + 6 * 16]
26196pmaddubsw m4, m0, m6
26197pmulhrsw m4, m7
26198pmaddubsw m5, m2, m6
26199pmulhrsw m5, m7
26200packuswb m4, m5
26201movu [r0 + 1282 * 16], m4
26202pmaddubsw m4, m1, m6
26203pmulhrsw m4, m7
26204pmaddubsw m5, m3, m6
26205pmulhrsw m5, m7
26206packuswb m4, m5
26207movu [r0 + 1283 * 16], m4
26208
26209; mode 22 [row 2] - shift in next reference pair (m0 from r4 bytes, m2/m1/m3 from r3 words)
26210movu m6, [r5 + 25 * 16]
26211pslldq m0, 2
26212pinsrb m0, [r4 + 0], 1
26213pinsrb m0, [r4 + 2], 0
26214pmaddubsw m4, m0, m6
26215pmulhrsw m4, m7
26216pslldq m2, 2
26217pinsrw m2, [r3 + 7], 0
26218pmaddubsw m5, m2, m6
26219pmulhrsw m5, m7
26220packuswb m4, m5
26221movu [r0 + 1284 * 16], m4
26222pslldq m1, 2
26223pinsrw m1, [r3 + 15], 0
26224pmaddubsw m4, m1, m6
26225pmulhrsw m4, m7
26226pslldq m3, 2
26227pinsrw m3, [r3 + 23], 0
26228pmaddubsw m5, m3, m6
26229pmulhrsw m5, m7
26230packuswb m4, m5
26231movu [r0 + 1285 * 16], m4
26232
26233; mode 22 [row 3]
26234movu m6, [r5 + 12 * 16]
26235pmaddubsw m4, m0, m6
26236pmulhrsw m4, m7
26237pmaddubsw m5, m2, m6
26238pmulhrsw m5, m7
26239packuswb m4, m5
26240movu [r0 + 1286 * 16], m4
26241pmaddubsw m4, m1, m6
26242pmulhrsw m4, m7
26243pmaddubsw m5, m3, m6
26244pmulhrsw m5, m7
26245packuswb m4, m5
26246movu [r0 + 1287 * 16], m4
26247
26248; mode 22 [row 4] - shift in next reference pair
26249movu m6, [r5 + 31 * 16]
26250pslldq m0, 2
26251pinsrb m0, [r4 + 2], 1
26252pinsrb m0, [r4 + 5], 0
26253pmaddubsw m4, m0, m6
26254pmulhrsw m4, m7
26255pslldq m2, 2
26256pinsrw m2, [r3 + 6], 0
26257pmaddubsw m5, m2, m6
26258pmulhrsw m5, m7
26259packuswb m4, m5
26260movu [r0 + 1288 * 16], m4
26261pslldq m1, 2
26262pinsrw m1, [r3 + 14], 0
26263pmaddubsw m4, m1, m6
26264pmulhrsw m4, m7
26265pslldq m3, 2
26266pinsrw m3, [r3 + 22], 0
26267pmaddubsw m5, m3, m6
26268pmulhrsw m5, m7
26269packuswb m4, m5
26270movu [r0 + 1289 * 16], m4
26271
26272; mode 22 [row 5]
26273movu m6, [r5 + 18 * 16]
26274pmaddubsw m4, m0, m6
26275pmulhrsw m4, m7
26276pmaddubsw m5, m2, m6
26277pmulhrsw m5, m7
26278packuswb m4, m5
26279movu [r0 + 1290 * 16], m4
26280pmaddubsw m4, m1, m6
26281pmulhrsw m4, m7
26282pmaddubsw m5, m3, m6
26283pmulhrsw m5, m7
26284packuswb m4, m5
26285movu [r0 + 1291 * 16], m4
26286
26287; mode 22 [row 6]
26288movu m6, [r5 + 5 * 16]
26289pmaddubsw m4, m0, m6
26290pmulhrsw m4, m7
26291pmaddubsw m5, m2, m6
26292pmulhrsw m5, m7
26293packuswb m4, m5
26294movu [r0 + 1292 * 16], m4
26295pmaddubsw m4, m1, m6
26296pmulhrsw m4, m7
26297pmaddubsw m5, m3, m6
26298pmulhrsw m5, m7
26299packuswb m4, m5
26300movu [r0 + 1293 * 16], m4
26301
26302; mode 22 [row 7] - shift in next reference pair
26303movu m6, [r5 + 24 * 16]
26304pslldq m0, 2
26305pinsrb m0, [r4 + 5], 1
26306pinsrb m0, [r4 + 7], 0
26307pmaddubsw m4, m0, m6
26308pmulhrsw m4, m7
26309pslldq m2, 2
26310pinsrw m2, [r3 + 5], 0
26311pmaddubsw m5, m2, m6
26312pmulhrsw m5, m7
26313packuswb m4, m5
26314movu [r0 + 1294 * 16], m4
26315pslldq m1, 2
26316pinsrw m1, [r3 + 13], 0
26317pmaddubsw m4, m1, m6
26318pmulhrsw m4, m7
26319pslldq m3, 2
26320pinsrw m3, [r3 + 21], 0
26321pmaddubsw m5, m3, m6
26322pmulhrsw m5, m7
26323packuswb m4, m5
26324movu [r0 + 1295 * 16], m4
26325
26326; mode 22 [row 8]
26327movu m6, [r5 + 11 * 16]
26328pmaddubsw m4, m0, m6
26329pmulhrsw m4, m7
26330pmaddubsw m5, m2, m6
26331pmulhrsw m5, m7
26332packuswb m4, m5
26333movu [r0 + 1296 * 16], m4
26334pmaddubsw m4, m1, m6
26335pmulhrsw m4, m7
26336pmaddubsw m5, m3, m6
26337pmulhrsw m5, m7
26338packuswb m4, m5
26339movu [r0 + 1297 * 16], m4
26340
26341; mode 22 [row 9] - shift in next reference pair
26342movu m6, [r5 + 30 * 16]
26343pslldq m0, 2
26344pinsrb m0, [r4 + 7], 1
26345pinsrb m0, [r4 + 10], 0
26346pmaddubsw m4, m0, m6
26347pmulhrsw m4, m7
26348pslldq m2, 2
26349pinsrw m2, [r3 + 4], 0
26350pmaddubsw m5, m2, m6
26351pmulhrsw m5, m7
26352packuswb m4, m5
26353movu [r0 + 1298 * 16], m4
26354pslldq m1, 2
26355pinsrw m1, [r3 + 12], 0
26356pmaddubsw m4, m1, m6
26357pmulhrsw m4, m7
26358pslldq m3, 2
26359pinsrw m3, [r3 + 20], 0
26360pmaddubsw m5, m3, m6
26361pmulhrsw m5, m7
26362packuswb m4, m5
26363movu [r0 + 1299 * 16], m4
26364
26365; mode 22 [row 10]
26366movu m6, [r5 + 17 * 16]
26367pmaddubsw m4, m0, m6
26368pmulhrsw m4, m7
26369pmaddubsw m5, m2, m6
26370pmulhrsw m5, m7
26371packuswb m4, m5
26372movu [r0 + 1300 * 16], m4
26373pmaddubsw m4, m1, m6
26374pmulhrsw m4, m7
26375pmaddubsw m5, m3, m6
26376pmulhrsw m5, m7
26377packuswb m4, m5
26378movu [r0 + 1301 * 16], m4
26379
26380; mode 22 [row 11]
26381movu m6, [r5 + 4 * 16]
26382pmaddubsw m4, m0, m6
26383pmulhrsw m4, m7
26384pmaddubsw m5, m2, m6
26385pmulhrsw m5, m7
26386packuswb m4, m5
26387movu [r0 + 1302 * 16], m4
26388pmaddubsw m4, m1, m6
26389pmulhrsw m4, m7
26390pmaddubsw m5, m3, m6
26391pmulhrsw m5, m7
26392packuswb m4, m5
26393movu [r0 + 1303 * 16], m4
26394
26395; mode 22 [row 12] - shift in next reference pair
26396movu m6, [r5 + 23 * 16]
26397pslldq m0, 2
26398pinsrb m0, [r4 + 10], 1
26399pinsrb m0, [r4 + 12], 0
26400pmaddubsw m4, m0, m6
26401pmulhrsw m4, m7
26402pslldq m2, 2
26403pinsrw m2, [r3 + 3], 0
26404pmaddubsw m5, m2, m6
26405pmulhrsw m5, m7
26406packuswb m4, m5
26407movu [r0 + 1304 * 16], m4
26408pslldq m1, 2
26409pinsrw m1, [r3 + 11], 0
26410pmaddubsw m4, m1, m6
26411pmulhrsw m4, m7
26412pslldq m3, 2
26413pinsrw m3, [r3 + 19], 0
26414pmaddubsw m5, m3, m6
26415pmulhrsw m5, m7
26416packuswb m4, m5
26417movu [r0 + 1305 * 16], m4
26418
26419; mode 22 [row 13]
26420movu m6, [r5 + 10 * 16]
26421pmaddubsw m4, m0, m6
26422pmulhrsw m4, m7
26423pmaddubsw m5, m2, m6
26424pmulhrsw m5, m7
26425packuswb m4, m5
26426movu [r0 + 1306 * 16], m4
26427pmaddubsw m4, m1, m6
26428pmulhrsw m4, m7
26429pmaddubsw m5, m3, m6
26430pmulhrsw m5, m7
26431packuswb m4, m5
26432movu [r0 + 1307 * 16], m4
26433
26434; mode 22 [row 14] - shift in next reference pair
26435movu m6, [r5 + 29 * 16]
26436pslldq m0, 2
26437pinsrb m0, [r4 + 12], 1
26438pinsrb m0, [r4 + 15], 0
26439pmaddubsw m4, m0, m6
26440pmulhrsw m4, m7
26441pslldq m2, 2
26442pinsrw m2, [r3 + 2], 0
26443pmaddubsw m5, m2, m6
26444pmulhrsw m5, m7
26445packuswb m4, m5
26446movu [r0 + 1308 * 16], m4
26447pslldq m1, 2
26448pinsrw m1, [r3 + 10], 0
26449pmaddubsw m4, m1, m6
26450pmulhrsw m4, m7
26451pslldq m3, 2
26452pinsrw m3, [r3 + 18], 0
26453pmaddubsw m5, m3, m6
26454pmulhrsw m5, m7
26455packuswb m4, m5
26456movu [r0 + 1309 * 16], m4
26457
26458; mode 22 [row 15]
26459movu m6, [r5 + 16 * 16]
26460pmaddubsw m4, m0, m6
26461pmulhrsw m4, m7
26462pmaddubsw m5, m2, m6
26463pmulhrsw m5, m7
26464packuswb m4, m5
26465movu [r0 + 1310 * 16], m4
26466pmaddubsw m4, m1, m6
26467pmulhrsw m4, m7
26468pmaddubsw m5, m3, m6
26469pmulhrsw m5, m7
26470packuswb m4, m5
26471movu [r0 + 1311 * 16], m4
26472
26473; mode 22 [row 16]
26474movu m6, [r5 + 3 * 16]
26475pmaddubsw m4, m0, m6
26476pmulhrsw m4, m7
26477pmaddubsw m5, m2, m6
26478pmulhrsw m5, m7
26479packuswb m4, m5
26480movu [r0 + 1312 * 16], m4
26481pmaddubsw m4, m1, m6
26482pmulhrsw m4, m7
26483pmaddubsw m5, m3, m6
26484pmulhrsw m5, m7
26485packuswb m4, m5
26486movu [r0 + 1313 * 16], m4
26487
26488; mode 22 [row 17] - shift in next reference pair
26489movu m6, [r5 + 22 * 16]
26490pslldq m0, 2
26491pinsrb m0, [r4 + 15], 1
26492pinsrb m0, [r4 + 17], 0
26493pmaddubsw m4, m0, m6
26494pmulhrsw m4, m7
26495pslldq m2, 2
26496pinsrw m2, [r3 + 1], 0
26497pmaddubsw m5, m2, m6
26498pmulhrsw m5, m7
26499packuswb m4, m5
26500movu [r0 + 1314 * 16], m4
26501pslldq m1, 2
26502pinsrw m1, [r3 + 9], 0
26503pmaddubsw m4, m1, m6
26504pmulhrsw m4, m7
26505pslldq m3, 2
26506pinsrw m3, [r3 + 17], 0
26507pmaddubsw m5, m3, m6
26508pmulhrsw m5, m7
26509packuswb m4, m5
26510movu [r0 + 1315 * 16], m4
26511
26512; mode 22 [row 18]
26513movu m6, [r5 + 9 * 16]
26514pmaddubsw m4, m0, m6
26515pmulhrsw m4, m7
26516pmaddubsw m5, m2, m6
26517pmulhrsw m5, m7
26518packuswb m4, m5
26519movu [r0 + 1316 * 16], m4
26520pmaddubsw m4, m1, m6
26521pmulhrsw m4, m7
26522pmaddubsw m5, m3, m6
26523pmulhrsw m5, m7
26524packuswb m4, m5
26525movu [r0 + 1317 * 16], m4
26526
26527; mode 22 [row 19] - shift in next reference pair (last m2 insert taken from r3)
26528movu m6, [r5 + 28 * 16]
26529pslldq m0, 2
26530pinsrb m0, [r4 + 17], 1
26531pinsrb m0, [r4 + 20], 0
26532pmaddubsw m4, m0, m6
26533pmulhrsw m4, m7
26534pslldq m2, 2
26535pinsrw m2, [r3 + 0], 0
26536pmaddubsw m5, m2, m6
26537pmulhrsw m5, m7
26538packuswb m4, m5
26539movu [r0 + 1318 * 16], m4
26540pslldq m1, 2
26541pinsrw m1, [r3 + 8], 0
26542pmaddubsw m4, m1, m6
26543pmulhrsw m4, m7
26544pslldq m3, 2
26545pinsrw m3, [r3 + 16], 0
26546pmaddubsw m5, m3, m6
26547pmulhrsw m5, m7
26548packuswb m4, m5
26549movu [r0 + 1319 * 16], m4
26550
26551; mode 22 [row 20]
26552movu m6, [r5 + 15 * 16]
26553pmaddubsw m4, m0, m6
26554pmulhrsw m4, m7
26555pmaddubsw m5, m2, m6
26556pmulhrsw m5, m7
26557packuswb m4, m5
26558movu [r0 + 1320 * 16], m4
26559pmaddubsw m4, m1, m6
26560pmulhrsw m4, m7
26561pmaddubsw m5, m3, m6
26562pmulhrsw m5, m7
26563packuswb m4, m5
26564movu [r0 + 1321 * 16], m4
26565
26566; mode 22 [row 21]
26567movu m6, [r5 + 2 * 16]
26568pmaddubsw m4, m0, m6
26569pmulhrsw m4, m7
26570pmaddubsw m5, m2, m6
26571pmulhrsw m5, m7
26572packuswb m4, m5
26573movu [r0 + 1322 * 16], m4
26574pmaddubsw m4, m1, m6
26575pmulhrsw m4, m7
26576pmaddubsw m5, m3, m6
26577pmulhrsw m5, m7
26578packuswb m4, m5
26579movu [r0 + 1323 * 16], m4
26580
26581; mode 22 [row 22] - shift in next reference pair (from here m2 is also fed from r4 bytes)
26582movu m6, [r5 + 21 * 16]
26583pslldq m0, 2
26584pinsrb m0, [r4 + 20], 1
26585pinsrb m0, [r4 + 22], 0
26586pmaddubsw m4, m0, m6
26587pmulhrsw m4, m7
26588pslldq m2, 2
26589pinsrb m2, [r4 + 0], 1
26590pinsrb m2, [r4 + 2], 0
26591pmaddubsw m5, m2, m6
26592pmulhrsw m5, m7
26593packuswb m4, m5
26594movu [r0 + 1324 * 16], m4
26595pslldq m1, 2
26596pinsrw m1, [r3 + 7], 0
26597pmaddubsw m4, m1, m6
26598pmulhrsw m4, m7
26599pslldq m3, 2
26600pinsrw m3, [r3 + 15], 0
26601pmaddubsw m5, m3, m6
26602pmulhrsw m5, m7
26603packuswb m4, m5
26604movu [r0 + 1325 * 16], m4
26605
26606; mode 22 [row 23]
26607movu m6, [r5 + 8 * 16]
26608pmaddubsw m4, m0, m6
26609pmulhrsw m4, m7
26610pmaddubsw m5, m2, m6
26611pmulhrsw m5, m7
26612packuswb m4, m5
26613movu [r0 + 1326 * 16], m4
26614pmaddubsw m4, m1, m6
26615pmulhrsw m4, m7
26616pmaddubsw m5, m3, m6
26617pmulhrsw m5, m7
26618packuswb m4, m5
26619movu [r0 + 1327 * 16], m4
26620
26621; mode 22 [row 24] - shift in next reference pair
26622movu m6, [r5 + 27 * 16]
26623pslldq m0, 2
26624pinsrb m0, [r4 + 22], 1
26625pinsrb m0, [r4 + 25], 0
26626pmaddubsw m4, m0, m6
26627pmulhrsw m4, m7
26628pslldq m2, 2
26629pinsrb m2, [r4 + 2], 1
26630pinsrb m2, [r4 + 5], 0
26631pmaddubsw m5, m2, m6
26632pmulhrsw m5, m7
26633packuswb m4, m5
26634movu [r0 + 1328 * 16], m4
26635pslldq m1, 2
26636pinsrw m1, [r3 + 6], 0
26637pmaddubsw m4, m1, m6
26638pmulhrsw m4, m7
26639pslldq m3, 2
26640pinsrw m3, [r3 + 14], 0
26641pmaddubsw m5, m3, m6
26642pmulhrsw m5, m7
26643packuswb m4, m5
26644movu [r0 + 1329 * 16], m4
26645
26646; mode 22 [row 25]
26647movu m6, [r5 + 14 * 16]
26648pmaddubsw m4, m0, m6
26649pmulhrsw m4, m7
26650pmaddubsw m5, m2, m6
26651pmulhrsw m5, m7
26652packuswb m4, m5
26653movu [r0 + 1330 * 16], m4
26654pmaddubsw m4, m1, m6
26655pmulhrsw m4, m7
26656pmaddubsw m5, m3, m6
26657pmulhrsw m5, m7
26658packuswb m4, m5
26659movu [r0 + 1331 * 16], m4
26660
26661; mode 22 [row 26]
26662movu m6, [r5 + 1 * 16]
26663pmaddubsw m4, m0, m6
26664pmulhrsw m4, m7
26665pmaddubsw m5, m2, m6
26666pmulhrsw m5, m7
26667packuswb m4, m5
26668movu [r0 + 1332 * 16], m4
26669pmaddubsw m4, m1, m6
26670pmulhrsw m4, m7
26671pmaddubsw m5, m3, m6
26672pmulhrsw m5, m7
26673packuswb m4, m5
26674movu [r0 + 1333 * 16], m4
26675
26676; mode 22 [row 27] - shift in next reference pair
26677movu m6, [r5 + 20 * 16]
26678pslldq m0, 2
26679pinsrb m0, [r4 + 25], 1
26680pinsrb m0, [r4 + 27], 0
26681pmaddubsw m4, m0, m6
26682pmulhrsw m4, m7
26683pslldq m2, 2
26684pinsrb m2, [r4 + 5], 1
26685pinsrb m2, [r4 + 7], 0
26686pmaddubsw m5, m2, m6
26687pmulhrsw m5, m7
26688packuswb m4, m5
26689movu [r0 + 1334 * 16], m4
26690pslldq m1, 2
26691pinsrw m1, [r3 + 5], 0
26692pmaddubsw m4, m1, m6
26693pmulhrsw m4, m7
26694pslldq m3, 2
26695pinsrw m3, [r3 + 13], 0
26696pmaddubsw m5, m3, m6
26697pmulhrsw m5, m7
26698packuswb m4, m5
26699movu [r0 + 1335 * 16], m4
26700
26701; mode 22 [row 28]
26702movu m6, [r5 + 7 * 16]
26703pmaddubsw m4, m0, m6
26704pmulhrsw m4, m7
26705pmaddubsw m5, m2, m6
26706pmulhrsw m5, m7
26707packuswb m4, m5
26708movu [r0 + 1336 * 16], m4
26709pmaddubsw m4, m1, m6
26710pmulhrsw m4, m7
26711pmaddubsw m5, m3, m6
26712pmulhrsw m5, m7
26713packuswb m4, m5
26714movu [r0 + 1337 * 16], m4
26715
26716; mode 22 [row 29] - shift in next reference pair
26717movu m6, [r5 + 26 * 16]
26718pslldq m0, 2
26719pinsrb m0, [r4 + 27], 1
26720pinsrb m0, [r4 + 30], 0
26721pmaddubsw m4, m0, m6
26722pmulhrsw m4, m7
26723pslldq m2, 2
26724pinsrb m2, [r4 + 7], 1
26725pinsrb m2, [r4 + 10], 0
26726pmaddubsw m5, m2, m6
26727pmulhrsw m5, m7
26728packuswb m4, m5
26729movu [r0 + 1338 * 16], m4
26730pslldq m1, 2
26731pinsrw m1, [r3 + 4], 0
26732pmaddubsw m4, m1, m6
26733pmulhrsw m4, m7
26734pslldq m3, 2
26735pinsrw m3, [r3 + 12], 0
26736pmaddubsw m5, m3, m6
26737pmulhrsw m5, m7
26738packuswb m4, m5
26739movu [r0 + 1339 * 16], m4
26740
26741; mode 22 [row 30]
26742movu m6, [r5 + 13 * 16]
26743pmaddubsw m4, m0, m6
26744pmulhrsw m4, m7
26745pmaddubsw m5, m2, m6
26746pmulhrsw m5, m7
26747packuswb m4, m5
26748movu [r0 + 1340 * 16], m4
26749pmaddubsw m4, m1, m6
26750pmulhrsw m4, m7
26751pmaddubsw m5, m3, m6
26752pmulhrsw m5, m7
26753packuswb m4, m5
26754movu [r0 + 1341 * 16], m4
26755
26756; mode 22 [row 31] - no coefficient row is loaded: each window is shuffled
; through tab_S2 and its low 8 bytes are stored directly
; (NOTE(review): presumably tab_S2 selects one byte of every pair because the
; interpolation weight is 0 on this row - confirm against the table)
26757pshufb m5, m0, [tab_S2]
26758movh [r0 + 1342 * 16], m5
26759pshufb m5, m2, [tab_S2]
26760movh [r0 + 1342 * 16 + 8], m5
26761pshufb m5, m1, [tab_S2]
26762movh [r0 + 1343 * 16], m5
26763pshufb m5, m3, [tab_S2]
26764movh [r0 + 1343 * 16 + 8], m5
26765
;-----------------------------------------------------------------------
; mode 23 section: 32 rows of 32 output bytes each.
; Row n is written as two 16-byte stores at r0 + (1344 + 2 * n) * 16 and
; r0 + (1345 + 2 * n) * 16.
;
; Register roles inside this section (as used by the code below):
;   m0, m2, m1, m3 - four windows of interleaved byte pairs; row 0 builds them
;                    from [r3 + 0], [r3 + 8], [r3 + 16], [r3 + 24] and the byte
;                    that follows each. m0|m2 feed the first store of a row,
;                    m1|m3 feed the second store.
;   m6             - 16-byte coefficient row loaded from r5 + k * 16
;   m7             - pmulhrsw multiplier, loaded before this section
;                    (NOTE(review): presumably the rounding scale for a >> 5 - confirm)
;   m4, m5         - scratch for the two halves of each store
;   r4             - byte source for the new low pair of m0
;                    (NOTE(review): presumably the projected side reference - confirm)
;
; Per row: pmaddubsw multiplies the unsigned bytes of a window by the signed
; bytes of m6 and sums each pair into a word, pmulhrsw scales by m7, and
; packuswb saturates two word vectors into 16 bytes.
;
; The coefficient row index k steps by -9 (mod 32) per row: 23, 14, 5, 28, ...
; On every row where k increases relative to the previous row, each window is
; shifted left by one byte pair (pslldq 2) and a new pair is inserted at the
; bottom (m0 from r4 bytes, m2/m1/m3 from a word at a decreasing r3 offset);
; on the other rows the windows are reused as-is.
;-----------------------------------------------------------------------
26766; mode 23 [row 0] - build the four pair windows from r3 and emit the first row
26767movu m6, [r5 + 23 * 16]
26768movu m0, [r3 ]
26769movu m1, [r3 + 1 ]
26770punpcklbw m0, m1
26771pmaddubsw m1, m0, m6
26772pmulhrsw m1, m7
26773movu m2, [r3 + 8]
26774movu m3, [r3 + 9]
26775punpcklbw m2, m3
26776pmaddubsw m3, m2, m6
26777pmulhrsw m3, m7
26778packuswb m1, m3
26779movu [r0 + 1344 * 16], m1
26780
; second 16 output bytes of row 0; m1 and m3 are rebuilt here as the
; windows starting at r3 + 16 and r3 + 24
26781movu m1, [r3 + 16]
26782movu m3, [r3 + 17]
26783punpcklbw m1, m3
26784pmaddubsw m4, m1, m6
26785pmulhrsw m4, m7
26786movu m3, [r3 + 24]
26787movu m5, [r3 + 25]
26788punpcklbw m3, m5
26789pmaddubsw m5, m3, m6
26790pmulhrsw m5, m7
26791packuswb m4, m5
26792movu [r0 + 1345 * 16], m4
26793
26794; mode 23 [row 1]
26795movu m6, [r5 + 14 * 16]
26796pmaddubsw m4, m0, m6
26797pmulhrsw m4, m7
26798pmaddubsw m5, m2, m6
26799pmulhrsw m5, m7
26800packuswb m4, m5
26801movu [r0 + 1346 * 16], m4
26802pmaddubsw m4, m1, m6
26803pmulhrsw m4, m7
26804pmaddubsw m5, m3, m6
26805pmulhrsw m5, m7
26806packuswb m4, m5
26807movu [r0 + 1347 * 16], m4
26808
26809; mode 23 [row 2]
26810movu m6, [r5 + 5 * 16]
26811pmaddubsw m4, m0, m6
26812pmulhrsw m4, m7
26813pmaddubsw m5, m2, m6
26814pmulhrsw m5, m7
26815packuswb m4, m5
26816movu [r0 + 1348 * 16], m4
26817pmaddubsw m4, m1, m6
26818pmulhrsw m4, m7
26819pmaddubsw m5, m3, m6
26820pmulhrsw m5, m7
26821packuswb m4, m5
26822movu [r0 + 1349 * 16], m4
26823
26824; mode 23 [row 3] - shift in next reference pair
26825movu m6, [r5 + 28 * 16]
26826pslldq m0, 2
26827pinsrb m0, [r4 + 0], 1
26828pinsrb m0, [r4 + 4], 0
26829pmaddubsw m4, m0, m6
26830pmulhrsw m4, m7
26831pslldq m2, 2
26832pinsrw m2, [r3 + 7], 0
26833pmaddubsw m5, m2, m6
26834pmulhrsw m5, m7
26835packuswb m4, m5
26836movu [r0 + 1350 * 16], m4
26837pslldq m1, 2
26838pinsrw m1, [r3 + 15], 0
26839pmaddubsw m4, m1, m6
26840pmulhrsw m4, m7
26841pslldq m3, 2
26842pinsrw m3, [r3 + 23], 0
26843pmaddubsw m5, m3, m6
26844pmulhrsw m5, m7
26845packuswb m4, m5
26846movu [r0 + 1351 * 16], m4
26847
26848; mode 23 [row 4]
26849movu m6, [r5 + 19 * 16]
26850pmaddubsw m4, m0, m6
26851pmulhrsw m4, m7
26852pmaddubsw m5, m2, m6
26853pmulhrsw m5, m7
26854packuswb m4, m5
26855movu [r0 + 1352 * 16], m4
26856pmaddubsw m4, m1, m6
26857pmulhrsw m4, m7
26858pmaddubsw m5, m3, m6
26859pmulhrsw m5, m7
26860packuswb m4, m5
26861movu [r0 + 1353 * 16], m4
26862
26863; mode 23 [row 5]
26864movu m6, [r5 + 10 * 16]
26865pmaddubsw m4, m0, m6
26866pmulhrsw m4, m7
26867pmaddubsw m5, m2, m6
26868pmulhrsw m5, m7
26869packuswb m4, m5
26870movu [r0 + 1354 * 16], m4
26871pmaddubsw m4, m1, m6
26872pmulhrsw m4, m7
26873pmaddubsw m5, m3, m6
26874pmulhrsw m5, m7
26875packuswb m4, m5
26876movu [r0 + 1355 * 16], m4
26877
26878; mode 23 [row 6]
26879movu m6, [r5 + 1 * 16]
26880pmaddubsw m4, m0, m6
26881pmulhrsw m4, m7
26882pmaddubsw m5, m2, m6
26883pmulhrsw m5, m7
26884packuswb m4, m5
26885movu [r0 + 1356 * 16], m4
26886pmaddubsw m4, m1, m6
26887pmulhrsw m4, m7
26888pmaddubsw m5, m3, m6
26889pmulhrsw m5, m7
26890packuswb m4, m5
26891movu [r0 + 1357 * 16], m4
26892
26893; mode 23 [row 7] - shift in next reference pair
26894movu m6, [r5 + 24 * 16]
26895pslldq m0, 2
26896pinsrb m0, [r4 + 4], 1
26897pinsrb m0, [r4 + 7], 0
26898pmaddubsw m4, m0, m6
26899pmulhrsw m4, m7
26900pslldq m2, 2
26901pinsrw m2, [r3 + 6], 0
26902pmaddubsw m5, m2, m6
26903pmulhrsw m5, m7
26904packuswb m4, m5
26905movu [r0 + 1358 * 16], m4
26906pslldq m1, 2
26907pinsrw m1, [r3 + 14], 0
26908pmaddubsw m4, m1, m6
26909pmulhrsw m4, m7
26910pslldq m3, 2
26911pinsrw m3, [r3 + 22], 0
26912pmaddubsw m5, m3, m6
26913pmulhrsw m5, m7
26914packuswb m4, m5
26915movu [r0 + 1359 * 16], m4
26916
26917; mode 23 [row 8]
26918movu m6, [r5 + 15 * 16]
26919pmaddubsw m4, m0, m6
26920pmulhrsw m4, m7
26921pmaddubsw m5, m2, m6
26922pmulhrsw m5, m7
26923packuswb m4, m5
26924movu [r0 + 1360 * 16], m4
26925pmaddubsw m4, m1, m6
26926pmulhrsw m4, m7
26927pmaddubsw m5, m3, m6
26928pmulhrsw m5, m7
26929packuswb m4, m5
26930movu [r0 + 1361 * 16], m4
26931
26932; mode 23 [row 9]
26933movu m6, [r5 + 6 * 16]
26934pmaddubsw m4, m0, m6
26935pmulhrsw m4, m7
26936pmaddubsw m5, m2, m6
26937pmulhrsw m5, m7
26938packuswb m4, m5
26939movu [r0 + 1362 * 16], m4
26940pmaddubsw m4, m1, m6
26941pmulhrsw m4, m7
26942pmaddubsw m5, m3, m6
26943pmulhrsw m5, m7
26944packuswb m4, m5
26945movu [r0 + 1363 * 16], m4
26946
26947; mode 23 [row 10] - shift in next reference pair
26948movu m6, [r5 + 29 * 16]
26949pslldq m0, 2
26950pinsrb m0, [r4 + 7], 1
26951pinsrb m0, [r4 + 11], 0
26952pmaddubsw m4, m0, m6
26953pmulhrsw m4, m7
26954pslldq m2, 2
26955pinsrw m2, [r3 + 5], 0
26956pmaddubsw m5, m2, m6
26957pmulhrsw m5, m7
26958packuswb m4, m5
26959movu [r0 + 1364 * 16], m4
26960pslldq m1, 2
26961pinsrw m1, [r3 + 13], 0
26962pmaddubsw m4, m1, m6
26963pmulhrsw m4, m7
26964pslldq m3, 2
26965pinsrw m3, [r3 + 21], 0
26966pmaddubsw m5, m3, m6
26967pmulhrsw m5, m7
26968packuswb m4, m5
26969movu [r0 + 1365 * 16], m4
26970
26971; mode 23 [row 11]
26972movu m6, [r5 + 20 * 16]
26973pmaddubsw m4, m0, m6
26974pmulhrsw m4, m7
26975pmaddubsw m5, m2, m6
26976pmulhrsw m5, m7
26977packuswb m4, m5
26978movu [r0 + 1366 * 16], m4
26979pmaddubsw m4, m1, m6
26980pmulhrsw m4, m7
26981pmaddubsw m5, m3, m6
26982pmulhrsw m5, m7
26983packuswb m4, m5
26984movu [r0 + 1367 * 16], m4
26985
26986; mode 23 [row 12]
26987movu m6, [r5 + 11 * 16]
26988pmaddubsw m4, m0, m6
26989pmulhrsw m4, m7
26990pmaddubsw m5, m2, m6
26991pmulhrsw m5, m7
26992packuswb m4, m5
26993movu [r0 + 1368 * 16], m4
26994pmaddubsw m4, m1, m6
26995pmulhrsw m4, m7
26996pmaddubsw m5, m3, m6
26997pmulhrsw m5, m7
26998packuswb m4, m5
26999movu [r0 + 1369 * 16], m4
27000
27001; mode 23 [row 13]
27002movu m6, [r5 + 2 * 16]
27003pmaddubsw m4, m0, m6
27004pmulhrsw m4, m7
27005pmaddubsw m5, m2, m6
27006pmulhrsw m5, m7
27007packuswb m4, m5
27008movu [r0 + 1370 * 16], m4
27009pmaddubsw m4, m1, m6
27010pmulhrsw m4, m7
27011pmaddubsw m5, m3, m6
27012pmulhrsw m5, m7
27013packuswb m4, m5
27014movu [r0 + 1371 * 16], m4
27015
27016; mode 23 [row 14] - shift in next reference pair
27017movu m6, [r5 + 25 * 16]
27018pslldq m0, 2
27019pinsrb m0, [r4 + 11], 1
27020pinsrb m0, [r4 + 14], 0
27021pmaddubsw m4, m0, m6
27022pmulhrsw m4, m7
27023pslldq m2, 2
27024pinsrw m2, [r3 + 4], 0
27025pmaddubsw m5, m2, m6
27026pmulhrsw m5, m7
27027packuswb m4, m5
27028movu [r0 + 1372 * 16], m4
27029pslldq m1, 2
27030pinsrw m1, [r3 + 12], 0
27031pmaddubsw m4, m1, m6
27032pmulhrsw m4, m7
27033pslldq m3, 2
27034pinsrw m3, [r3 + 20], 0
27035pmaddubsw m5, m3, m6
27036pmulhrsw m5, m7
27037packuswb m4, m5
27038movu [r0 + 1373 * 16], m4
27039
27040; mode 23 [row 15]
27041movu m6, [r5 + 16 * 16]
27042pmaddubsw m4, m0, m6
27043pmulhrsw m4, m7
27044pmaddubsw m5, m2, m6
27045pmulhrsw m5, m7
27046packuswb m4, m5
27047movu [r0 + 1374 * 16], m4
27048pmaddubsw m4, m1, m6
27049pmulhrsw m4, m7
27050pmaddubsw m5, m3, m6
27051pmulhrsw m5, m7
27052packuswb m4, m5
27053movu [r0 + 1375 * 16], m4
27054
27055; mode 23 [row 16]
27056movu m6, [r5 + 7 * 16]
27057pmaddubsw m4, m0, m6
27058pmulhrsw m4, m7
27059pmaddubsw m5, m2, m6
27060pmulhrsw m5, m7
27061packuswb m4, m5
27062movu [r0 + 1376 * 16], m4
27063pmaddubsw m4, m1, m6
27064pmulhrsw m4, m7
27065pmaddubsw m5, m3, m6
27066pmulhrsw m5, m7
27067packuswb m4, m5
27068movu [r0 + 1377 * 16], m4
27069
27070; mode 23 [row 17] - shift in next reference pair
27071movu m6, [r5 + 30 * 16]
27072pslldq m0, 2
27073pinsrb m0, [r4 + 14], 1
27074pinsrb m0, [r4 + 18], 0
27075pmaddubsw m4, m0, m6
27076pmulhrsw m4, m7
27077pslldq m2, 2
27078pinsrw m2, [r3 + 3], 0
27079pmaddubsw m5, m2, m6
27080pmulhrsw m5, m7
27081packuswb m4, m5
27082movu [r0 + 1378 * 16], m4
27083pslldq m1, 2
27084pinsrw m1, [r3 + 11], 0
27085pmaddubsw m4, m1, m6
27086pmulhrsw m4, m7
27087pslldq m3, 2
27088pinsrw m3, [r3 + 19], 0
27089pmaddubsw m5, m3, m6
27090pmulhrsw m5, m7
27091packuswb m4, m5
27092movu [r0 + 1379 * 16], m4
27093
27094; mode 23 [row 18]
27095movu m6, [r5 + 21 * 16]
27096pmaddubsw m4, m0, m6
27097pmulhrsw m4, m7
27098pmaddubsw m5, m2, m6
27099pmulhrsw m5, m7
27100packuswb m4, m5
27101movu [r0 + 1380 * 16], m4
27102pmaddubsw m4, m1, m6
27103pmulhrsw m4, m7
27104pmaddubsw m5, m3, m6
27105pmulhrsw m5, m7
27106packuswb m4, m5
27107movu [r0 + 1381 * 16], m4
27108
27109; mode 23 [row 19]
27110movu m6, [r5 + 12 * 16]
27111pmaddubsw m4, m0, m6
27112pmulhrsw m4, m7
27113pmaddubsw m5, m2, m6
27114pmulhrsw m5, m7
27115packuswb m4, m5
27116movu [r0 + 1382 * 16], m4
27117pmaddubsw m4, m1, m6
27118pmulhrsw m4, m7
27119pmaddubsw m5, m3, m6
27120pmulhrsw m5, m7
27121packuswb m4, m5
27122movu [r0 + 1383 * 16], m4
27123
27124; mode 23 [row 20]
27125movu m6, [r5 + 3 * 16]
27126pmaddubsw m4, m0, m6
27127pmulhrsw m4, m7
27128pmaddubsw m5, m2, m6
27129pmulhrsw m5, m7
27130packuswb m4, m5
27131movu [r0 + 1384 * 16], m4
27132pmaddubsw m4, m1, m6
27133pmulhrsw m4, m7
27134pmaddubsw m5, m3, m6
27135pmulhrsw m5, m7
27136packuswb m4, m5
27137movu [r0 + 1385 * 16], m4
27138
27139; mode 23 [row 21] - shift in next reference pair
27140movu m6, [r5 + 26 * 16]
27141pslldq m0, 2
27142pinsrb m0, [r4 + 18], 1
27143pinsrb m0, [r4 + 21], 0
27144pmaddubsw m4, m0, m6
27145pmulhrsw m4, m7
27146pslldq m2, 2
27147pinsrw m2, [r3 + 2], 0
27148pmaddubsw m5, m2, m6
27149pmulhrsw m5, m7
27150packuswb m4, m5
27151movu [r0 + 1386 * 16], m4
27152pslldq m1, 2
27153pinsrw m1, [r3 + 10], 0
27154pmaddubsw m4, m1, m6
27155pmulhrsw m4, m7
27156pslldq m3, 2
27157pinsrw m3, [r3 + 18], 0
27158pmaddubsw m5, m3, m6
27159pmulhrsw m5, m7
27160packuswb m4, m5
27161movu [r0 + 1387 * 16], m4
27162
27163; mode 23 [row 22]
27164movu m6, [r5 + 17 * 16]
27165pmaddubsw m4, m0, m6
27166pmulhrsw m4, m7
27167pmaddubsw m5, m2, m6
27168pmulhrsw m5, m7
27169packuswb m4, m5
27170movu [r0 + 1388 * 16], m4
27171pmaddubsw m4, m1, m6
27172pmulhrsw m4, m7
27173pmaddubsw m5, m3, m6
27174pmulhrsw m5, m7
27175packuswb m4, m5
27176movu [r0 + 1389 * 16], m4
27177
27178; mode 23 [row 23]
27179movu m6, [r5 + 8 * 16]
27180pmaddubsw m4, m0, m6
27181pmulhrsw m4, m7
27182pmaddubsw m5, m2, m6
27183pmulhrsw m5, m7
27184packuswb m4, m5
27185movu [r0 + 1390 * 16], m4
27186pmaddubsw m4, m1, m6
27187pmulhrsw m4, m7
27188pmaddubsw m5, m3, m6
27189pmulhrsw m5, m7
27190packuswb m4, m5
27191movu [r0 + 1391 * 16], m4
27192
27193; mode 23 [row 24] - shift in next reference pair
27194movu m6, [r5 + 31 * 16]
27195pslldq m0, 2
27196pinsrb m0, [r4 + 21], 1
27197pinsrb m0, [r4 + 25], 0
27198pmaddubsw m4, m0, m6
27199pmulhrsw m4, m7
27200pslldq m2, 2
27201pinsrw m2, [r3 + 1], 0
27202pmaddubsw m5, m2, m6
27203pmulhrsw m5, m7
27204packuswb m4, m5
27205movu [r0 + 1392 * 16], m4
27206pslldq m1, 2
27207pinsrw m1, [r3 + 9], 0
27208pmaddubsw m4, m1, m6
27209pmulhrsw m4, m7
27210pslldq m3, 2
27211pinsrw m3, [r3 + 17], 0
27212pmaddubsw m5, m3, m6
27213pmulhrsw m5, m7
27214packuswb m4, m5
27215movu [r0 + 1393 * 16], m4
27216
27217; mode 23 [row 25]
27218movu m6, [r5 + 22 * 16]
27219pmaddubsw m4, m0, m6
27220pmulhrsw m4, m7
27221pmaddubsw m5, m2, m6
27222pmulhrsw m5, m7
27223packuswb m4, m5
27224movu [r0 + 1394 * 16], m4
27225pmaddubsw m4, m1, m6
27226pmulhrsw m4, m7
27227pmaddubsw m5, m3, m6
27228pmulhrsw m5, m7
27229packuswb m4, m5
27230movu [r0 + 1395 * 16], m4
27231
27232; mode 23 [row 26]
27233movu m6, [r5 + 13 * 16]
27234pmaddubsw m4, m0, m6
27235pmulhrsw m4, m7
27236pmaddubsw m5, m2, m6
27237pmulhrsw m5, m7
27238packuswb m4, m5
27239movu [r0 + 1396 * 16], m4
27240pmaddubsw m4, m1, m6
27241pmulhrsw m4, m7
27242pmaddubsw m5, m3, m6
27243pmulhrsw m5, m7
27244packuswb m4, m5
27245movu [r0 + 1397 * 16], m4
27246
27247; mode 23 [row 27]
27248movu m6, [r5 + 4 * 16]
27249pmaddubsw m4, m0, m6
27250pmulhrsw m4, m7
27251pmaddubsw m5, m2, m6
27252pmulhrsw m5, m7
27253packuswb m4, m5
27254movu [r0 + 1398 * 16], m4
27255pmaddubsw m4, m1, m6
27256pmulhrsw m4, m7
27257pmaddubsw m5, m3, m6
27258pmulhrsw m5, m7
27259packuswb m4, m5
27260movu [r0 + 1399 * 16], m4
27261
27262; mode 23 [row 28] - shift in next reference pair
27263movu m6, [r5 + 27 * 16]
27264pslldq m0, 2
27265pinsrb m0, [r4 + 25], 1
27266pinsrb m0, [r4 + 28], 0
27267pmaddubsw m4, m0, m6
27268pmulhrsw m4, m7
27269pslldq m2, 2
27270pinsrw m2, [r3 + 0], 0
27271pmaddubsw m5, m2, m6
27272pmulhrsw m5, m7
27273packuswb m4, m5
27274movu [r0 + 1400 * 16], m4
27275pslldq m1, 2
27276pinsrw m1, [r3 + 8], 0
27277pmaddubsw m4, m1, m6
27278pmulhrsw m4, m7
27279pslldq m3, 2
27280pinsrw m3, [r3 + 16], 0
27281pmaddubsw m5, m3, m6
27282pmulhrsw m5, m7
27283packuswb m4, m5
27284movu [r0 + 1401 * 16], m4
27285
27286; mode 23 [row 29]
27287movu m6, [r5 + 18 * 16]
27288pmaddubsw m4, m0, m6
27289pmulhrsw m4, m7
27290pmaddubsw m5, m2, m6
27291pmulhrsw m5, m7
27292packuswb m4, m5
27293movu [r0 + 1402 * 16], m4
27294pmaddubsw m4, m1, m6
27295pmulhrsw m4, m7
27296pmaddubsw m5, m3, m6
27297pmulhrsw m5, m7
27298packuswb m4, m5
27299movu [r0 + 1403 * 16], m4
27300
27301; mode 23 [row 30]
27302movu m6, [r5 + 9 * 16]
27303pmaddubsw m4, m0, m6
27304pmulhrsw m4, m7
27305pmaddubsw m5, m2, m6
27306pmulhrsw m5, m7
27307packuswb m4, m5
27308movu [r0 + 1404 * 16], m4
27309pmaddubsw m4, m1, m6
27310pmulhrsw m4, m7
27311pmaddubsw m5, m3, m6
27312pmulhrsw m5, m7
27313packuswb m4, m5
27314movu [r0 + 1405 * 16], m4
27315
27316; mode 23 [row 31] - no coefficient row is loaded: each window is shuffled
; through tab_S2 and its low 8 bytes are stored directly
; (NOTE(review): presumably tab_S2 selects one byte of every pair because the
; interpolation weight is 0 on this row - confirm against the table)
27317pshufb m5, m0, [tab_S2]
27318movh [r0 + 1406 * 16], m5
27319pshufb m5, m2, [tab_S2]
27320movh [r0 + 1406 * 16 + 8], m5
27321pshufb m5, m1, [tab_S2]
27322movh [r0 + 1407 * 16], m5
27323pshufb m5, m3, [tab_S2]
27324movh [r0 + 1407 * 16 + 8], m5
27325
27326; mode 24 [row 0]
27327movu m6, [r5 + 27 * 16]
27328movu m0, [r3 ]
27329movu m1, [r3 + 1 ]
27330punpcklbw m0, m1
27331pmaddubsw m4, m0, m6
27332pmulhrsw m4, m7
27333movu m2, [r3 + 8]
27334movu m3, [r3 + 9]
27335punpcklbw m2, m3
27336pmaddubsw m5, m2, m6
27337pmulhrsw m5, m7
27338packuswb m4, m5
27339movu [r0 + 1408 * 16], m4
27340
27341movu m1, [r3 + 16]
27342movu m3, [r3 + 17]
27343punpcklbw m1, m3
27344pmaddubsw m4, m1, m6
27345pmulhrsw m4, m7
27346movu m3, [r3 + 24]
27347movu m5, [r3 + 25]
27348punpcklbw m3, m5
27349pmaddubsw m5, m3, m6
27350pmulhrsw m5, m7
27351packuswb m4, m5
27352movu [r0 + 1409 * 16], m4
27353
27354; mode 24 [row 1]
27355movu m6, [r5 + 22 * 16]
27356pmaddubsw m4, m0, m6
27357pmulhrsw m4, m7
27358pmaddubsw m5, m2, m6
27359pmulhrsw m5, m7
27360packuswb m4, m5
27361movu [r0 + 1410 * 16], m4
27362pmaddubsw m4, m1, m6
27363pmulhrsw m4, m7
27364pmaddubsw m5, m3, m6
27365pmulhrsw m5, m7
27366packuswb m4, m5
27367movu [r0 + 1411 * 16], m4
27368
27369; mode 24 [row 2]
27370movu m6, [r5 + 17 * 16]
27371pmaddubsw m4, m0, m6
27372pmulhrsw m4, m7
27373pmaddubsw m5, m2, m6
27374pmulhrsw m5, m7
27375packuswb m4, m5
27376movu [r0 + 1412 * 16], m4
27377pmaddubsw m4, m1, m6
27378pmulhrsw m4, m7
27379pmaddubsw m5, m3, m6
27380pmulhrsw m5, m7
27381packuswb m4, m5
27382movu [r0 + 1413 * 16], m4
27383
27384; mode 24 [row 3]
27385movu m6, [r5 + 12 * 16]
27386pmaddubsw m4, m0, m6
27387pmulhrsw m4, m7
27388pmaddubsw m5, m2, m6
27389pmulhrsw m5, m7
27390packuswb m4, m5
27391movu [r0 + 1414 * 16], m4
27392pmaddubsw m4, m1, m6
27393pmulhrsw m4, m7
27394pmaddubsw m5, m3, m6
27395pmulhrsw m5, m7
27396packuswb m4, m5
27397movu [r0 + 1415 * 16], m4
27398
27399; mode 24 [row 4]
27400movu m6, [r5 + 7 * 16]
27401pmaddubsw m4, m0, m6
27402pmulhrsw m4, m7
27403pmaddubsw m5, m2, m6
27404pmulhrsw m5, m7
27405packuswb m4, m5
27406movu [r0 + 1416 * 16], m4
27407pmaddubsw m4, m1, m6
27408pmulhrsw m4, m7
27409pmaddubsw m5, m3, m6
27410pmulhrsw m5, m7
27411packuswb m4, m5
27412movu [r0 + 1417 * 16], m4
27413
27414; mode 24 [row 5]
27415movu m6, [r5 + 2 * 16]
27416pmaddubsw m4, m0, m6
27417pmulhrsw m4, m7
27418pmaddubsw m5, m2, m6
27419pmulhrsw m5, m7
27420packuswb m4, m5
27421movu [r0 + 1418 * 16], m4
27422pmaddubsw m4, m1, m6
27423pmulhrsw m4, m7
27424pmaddubsw m5, m3, m6
27425pmulhrsw m5, m7
27426packuswb m4, m5
27427movu [r0 + 1419 * 16], m4
27428
27429; mode 24 [row 6]
27430movu m6, [r5 + 29 * 16]
27431pslldq m0, 2
27432pinsrb m0, [r4 + 0], 1
27433pinsrb m0, [r4 + 6], 0
27434pmaddubsw m4, m0, m6
27435pmulhrsw m4, m7
27436pslldq m2, 2
27437pinsrw m2, [r3 + 7], 0
27438pmaddubsw m5, m2, m6
27439pmulhrsw m5, m7
27440packuswb m4, m5
27441movu [r0 + 1420 * 16], m4
27442pslldq m1, 2
27443pinsrw m1, [r3 + 15], 0
27444pmaddubsw m4, m1, m6
27445pmulhrsw m4, m7
27446pslldq m3, 2
27447pinsrw m3, [r3 + 23], 0
27448pmaddubsw m5, m3, m6
27449pmulhrsw m5, m7
27450packuswb m4, m5
27451movu [r0 + 1421 * 16], m4
27452
27453; mode 24 [row 7]
27454movu m6, [r5 + 24 * 16]
27455pmaddubsw m4, m0, m6
27456pmulhrsw m4, m7
27457pmaddubsw m5, m2, m6
27458pmulhrsw m5, m7
27459packuswb m4, m5
27460movu [r0 + 1422 * 16], m4
27461pmaddubsw m4, m1, m6
27462pmulhrsw m4, m7
27463pmaddubsw m5, m3, m6
27464pmulhrsw m5, m7
27465packuswb m4, m5
27466movu [r0 + 1423 * 16], m4
27467
27468; mode 24 [row 8]
27469movu m6, [r5 + 19 * 16]
27470pmaddubsw m4, m0, m6
27471pmulhrsw m4, m7
27472pmaddubsw m5, m2, m6
27473pmulhrsw m5, m7
27474packuswb m4, m5
27475movu [r0 + 1424 * 16], m4
27476pmaddubsw m4, m1, m6
27477pmulhrsw m4, m7
27478pmaddubsw m5, m3, m6
27479pmulhrsw m5, m7
27480packuswb m4, m5
27481movu [r0 + 1425 * 16], m4
27482
27483; mode 24 [row 9]
27484movu m6, [r5 + 14 * 16]
27485pmaddubsw m4, m0, m6
27486pmulhrsw m4, m7
27487pmaddubsw m5, m2, m6
27488pmulhrsw m5, m7
27489packuswb m4, m5
27490movu [r0 + 1426 * 16], m4
27491pmaddubsw m4, m1, m6
27492pmulhrsw m4, m7
27493pmaddubsw m5, m3, m6
27494pmulhrsw m5, m7
27495packuswb m4, m5
27496movu [r0 + 1427 * 16], m4
27497
27498; mode 24 [row 10]
27499movu m6, [r5 + 9 * 16]
27500pmaddubsw m4, m0, m6
27501pmulhrsw m4, m7
27502pmaddubsw m5, m2, m6
27503pmulhrsw m5, m7
27504packuswb m4, m5
27505movu [r0 + 1428 * 16], m4
27506pmaddubsw m4, m1, m6
27507pmulhrsw m4, m7
27508pmaddubsw m5, m3, m6
27509pmulhrsw m5, m7
27510packuswb m4, m5
27511movu [r0 + 1429 * 16], m4
27512
27513; mode 24 [row 11]
27514movu m6, [r5 + 4 * 16]
27515pmaddubsw m4, m0, m6
27516pmulhrsw m4, m7
27517pmaddubsw m5, m2, m6
27518pmulhrsw m5, m7
27519packuswb m4, m5
27520movu [r0 + 1430 * 16], m4
27521pmaddubsw m4, m1, m6
27522pmulhrsw m4, m7
27523pmaddubsw m5, m3, m6
27524pmulhrsw m5, m7
27525packuswb m4, m5
27526movu [r0 + 1431 * 16], m4
27527
27528; mode 24 [row 12]
27529movu m6, [r5 + 31 * 16]
27530pslldq m0, 2
27531pinsrb m0, [r4 + 6], 1
27532pinsrb m0, [r4 + 13], 0
27533pmaddubsw m4, m0, m6
27534pmulhrsw m4, m7
27535pslldq m2, 2
27536pinsrw m2, [r3 + 6], 0
27537pmaddubsw m5, m2, m6
27538pmulhrsw m5, m7
27539packuswb m4, m5
27540movu [r0 + 1432 * 16], m4
27541pslldq m1, 2
27542pinsrw m1, [r3 + 14], 0
27543pmaddubsw m4, m1, m6
27544pmulhrsw m4, m7
27545pslldq m3, 2
27546pinsrw m3, [r3 + 22], 0
27547pmaddubsw m5, m3, m6
27548pmulhrsw m5, m7
27549packuswb m4, m5
27550movu [r0 + 1433 * 16], m4
27551
27552; mode 24 [row 13]
27553movu m6, [r5 + 26 * 16]
27554pmaddubsw m4, m0, m6
27555pmulhrsw m4, m7
27556pmaddubsw m5, m2, m6
27557pmulhrsw m5, m7
27558packuswb m4, m5
27559movu [r0 + 1434 * 16], m4
27560pmaddubsw m4, m1, m6
27561pmulhrsw m4, m7
27562pmaddubsw m5, m3, m6
27563pmulhrsw m5, m7
27564packuswb m4, m5
27565movu [r0 + 1435 * 16], m4
27566
27567; mode 24 [row 14]
27568movu m6, [r5 + 21 * 16]
27569pmaddubsw m4, m0, m6
27570pmulhrsw m4, m7
27571pmaddubsw m5, m2, m6
27572pmulhrsw m5, m7
27573packuswb m4, m5
27574movu [r0 + 1436 * 16], m4
27575pmaddubsw m4, m1, m6
27576pmulhrsw m4, m7
27577pmaddubsw m5, m3, m6
27578pmulhrsw m5, m7
27579packuswb m4, m5
27580movu [r0 + 1437 * 16], m4
27581
27582; mode 24 [row 15]
27583movu m6, [r5 + 16 * 16]
27584pmaddubsw m4, m0, m6
27585pmulhrsw m4, m7
27586pmaddubsw m5, m2, m6
27587pmulhrsw m5, m7
27588packuswb m4, m5
27589movu [r0 + 1438 * 16], m4
27590pmaddubsw m4, m1, m6
27591pmulhrsw m4, m7
27592pmaddubsw m5, m3, m6
27593pmulhrsw m5, m7
27594packuswb m4, m5
27595movu [r0 + 1439 * 16], m4
27596
27597; mode 24 [row 16]
27598movu m6, [r5 + 11 * 16]
27599pmaddubsw m4, m0, m6
27600pmulhrsw m4, m7
27601pmaddubsw m5, m2, m6
27602pmulhrsw m5, m7
27603packuswb m4, m5
27604movu [r0 + 1440 * 16], m4
27605pmaddubsw m4, m1, m6
27606pmulhrsw m4, m7
27607pmaddubsw m5, m3, m6
27608pmulhrsw m5, m7
27609packuswb m4, m5
27610movu [r0 + 1441 * 16], m4
27611
27612; mode 24 [row 17]
27613movu m6, [r5 + 6 * 16]
27614pmaddubsw m4, m0, m6
27615pmulhrsw m4, m7
27616pmaddubsw m5, m2, m6
27617pmulhrsw m5, m7
27618packuswb m4, m5
27619movu [r0 + 1442 * 16], m4
27620pmaddubsw m4, m1, m6
27621pmulhrsw m4, m7
27622pmaddubsw m5, m3, m6
27623pmulhrsw m5, m7
27624packuswb m4, m5
27625movu [r0 + 1443 * 16], m4
27626
27627; mode 24 [row 18]
27628movu m6, [r5 + 1 * 16]
27629pmaddubsw m4, m0, m6
27630pmulhrsw m4, m7
27631pmaddubsw m5, m2, m6
27632pmulhrsw m5, m7
27633packuswb m4, m5
27634movu [r0 + 1444 * 16], m4
27635pmaddubsw m4, m1, m6
27636pmulhrsw m4, m7
27637pmaddubsw m5, m3, m6
27638pmulhrsw m5, m7
27639packuswb m4, m5
27640movu [r0 + 1445 * 16], m4
27641
27642; mode 24 [row 19]
27643movu m6, [r5 + 28 * 16]
27644pslldq m0, 2
27645pinsrb m0, [r4 + 13], 1
27646pinsrb m0, [r4 + 19], 0
27647pmaddubsw m4, m0, m6
27648pmulhrsw m4, m7
27649pslldq m2, 2
27650pinsrw m2, [r3 + 5], 0
27651pmaddubsw m5, m2, m6
27652pmulhrsw m5, m7
27653packuswb m4, m5
27654movu [r0 + 1446 * 16], m4
27655pslldq m1, 2
27656pinsrw m1, [r3 + 13], 0
27657pmaddubsw m4, m1, m6
27658pmulhrsw m4, m7
27659pslldq m3, 2
27660pinsrw m3, [r3 + 21], 0
27661pmaddubsw m5, m3, m6
27662pmulhrsw m5, m7
27663packuswb m4, m5
27664movu [r0 + 1447 * 16], m4
27665
27666; mode 24 [row 20]
27667movu m6, [r5 + 23 * 16]
27668pmaddubsw m4, m0, m6
27669pmulhrsw m4, m7
27670pmaddubsw m5, m2, m6
27671pmulhrsw m5, m7
27672packuswb m4, m5
27673movu [r0 + 1448 * 16], m4
27674pmaddubsw m4, m1, m6
27675pmulhrsw m4, m7
27676pmaddubsw m5, m3, m6
27677pmulhrsw m5, m7
27678packuswb m4, m5
27679movu [r0 + 1449 * 16], m4
27680
27681; mode 24 [row 21]
27682movu m6, [r5 + 18 * 16]
27683pmaddubsw m4, m0, m6
27684pmulhrsw m4, m7
27685pmaddubsw m5, m2, m6
27686pmulhrsw m5, m7
27687packuswb m4, m5
27688movu [r0 + 1450 * 16], m4
27689pmaddubsw m4, m1, m6
27690pmulhrsw m4, m7
27691pmaddubsw m5, m3, m6
27692pmulhrsw m5, m7
27693packuswb m4, m5
27694movu [r0 + 1451 * 16], m4
27695
27696; mode 24 [row 22]
27697movu m6, [r5 + 13 * 16]
27698pmaddubsw m4, m0, m6
27699pmulhrsw m4, m7
27700pmaddubsw m5, m2, m6
27701pmulhrsw m5, m7
27702packuswb m4, m5
27703movu [r0 + 1452 * 16], m4
27704pmaddubsw m4, m1, m6
27705pmulhrsw m4, m7
27706pmaddubsw m5, m3, m6
27707pmulhrsw m5, m7
27708packuswb m4, m5
27709movu [r0 + 1453 * 16], m4
27710
27711; mode 24 [row 23]
27712movu m6, [r5 + 8 * 16]
27713pmaddubsw m4, m0, m6
27714pmulhrsw m4, m7
27715pmaddubsw m5, m2, m6
27716pmulhrsw m5, m7
27717packuswb m4, m5
27718movu [r0 + 1454 * 16], m4
27719pmaddubsw m4, m1, m6
27720pmulhrsw m4, m7
27721pmaddubsw m5, m3, m6
27722pmulhrsw m5, m7
27723packuswb m4, m5
27724movu [r0 + 1455 * 16], m4
27725
27726; mode 24 [row 24]
27727movu m6, [r5 + 3 * 16]
27728pmaddubsw m4, m0, m6
27729pmulhrsw m4, m7
27730pmaddubsw m5, m2, m6
27731pmulhrsw m5, m7
27732packuswb m4, m5
27733movu [r0 + 1456 * 16], m4
27734pmaddubsw m4, m1, m6
27735pmulhrsw m4, m7
27736pmaddubsw m5, m3, m6
27737pmulhrsw m5, m7
27738packuswb m4, m5
27739movu [r0 + 1457 * 16], m4
27740
27741; mode 24 [row 25]
27742movu m6, [r5 + 30 * 16]
27743pslldq m0, 2
27744pinsrb m0, [r4 + 19], 1
27745pinsrb m0, [r4 + 26], 0
27746pmaddubsw m4, m0, m6
27747pmulhrsw m4, m7
27748pslldq m2, 2
27749pinsrw m2, [r3 + 4], 0
27750pmaddubsw m5, m2, m6
27751pmulhrsw m5, m7
27752packuswb m4, m5
27753movu [r0 + 1458 * 16], m4
27754pslldq m1, 2
27755pinsrw m1, [r3 + 12], 0
27756pmaddubsw m4, m1, m6
27757pmulhrsw m4, m7
27758pslldq m3, 2
27759pinsrw m3, [r3 + 20], 0
27760pmaddubsw m5, m3, m6
27761pmulhrsw m5, m7
27762packuswb m4, m5
27763movu [r0 + 1459 * 16], m4
27764
27765; mode 24 [row 26]
27766movu m6, [r5 + 25 * 16]
27767pmaddubsw m4, m0, m6
27768pmulhrsw m4, m7
27769pmaddubsw m5, m2, m6
27770pmulhrsw m5, m7
27771packuswb m4, m5
27772movu [r0 + 1460 * 16], m4
27773pmaddubsw m4, m1, m6
27774pmulhrsw m4, m7
27775pmaddubsw m5, m3, m6
27776pmulhrsw m5, m7
27777packuswb m4, m5
27778movu [r0 + 1461 * 16], m4
27779
27780; mode 24 [row 27]
27781movu m6, [r5 + 20 * 16]
27782pmaddubsw m4, m0, m6
27783pmulhrsw m4, m7
27784pmaddubsw m5, m2, m6
27785pmulhrsw m5, m7
27786packuswb m4, m5
27787movu [r0 + 1462 * 16], m4
27788pmaddubsw m4, m1, m6
27789pmulhrsw m4, m7
27790pmaddubsw m5, m3, m6
27791pmulhrsw m5, m7
27792packuswb m4, m5
27793movu [r0 + 1463 * 16], m4
27794
27795; mode 24 [row 28]
27796movu m6, [r5 + 15 * 16]
27797pmaddubsw m4, m0, m6
27798pmulhrsw m4, m7
27799pmaddubsw m5, m2, m6
27800pmulhrsw m5, m7
27801packuswb m4, m5
27802movu [r0 + 1464 * 16], m4
27803pmaddubsw m4, m1, m6
27804pmulhrsw m4, m7
27805pmaddubsw m5, m3, m6
27806pmulhrsw m5, m7
27807packuswb m4, m5
27808movu [r0 + 1465 * 16], m4
27809
27810; mode 24 [row 29]
27811movu m6, [r5 + 10 * 16]
27812pmaddubsw m4, m0, m6
27813pmulhrsw m4, m7
27814pmaddubsw m5, m2, m6
27815pmulhrsw m5, m7
27816packuswb m4, m5
27817movu [r0 + 1466 * 16], m4
27818pmaddubsw m4, m1, m6
27819pmulhrsw m4, m7
27820pmaddubsw m5, m3, m6
27821pmulhrsw m5, m7
27822packuswb m4, m5
27823movu [r0 + 1467 * 16], m4
27824
27825; mode 24 [row 30]
27826movu m6, [r5 + 5 * 16]
27827pmaddubsw m4, m0, m6
27828pmulhrsw m4, m7
27829pmaddubsw m5, m2, m6
27830pmulhrsw m5, m7
27831packuswb m4, m5
27832movu [r0 + 1468 * 16], m4
27833pmaddubsw m4, m1, m6
27834pmulhrsw m4, m7
27835pmaddubsw m5, m3, m6
27836pmulhrsw m5, m7
27837packuswb m4, m5
27838movu [r0 + 1469 * 16], m4
27839
27840; mode 24 [row 31]
27841pshufb m5, m0, [tab_S2]
27842movh [r0 + 1470 * 16], m5
27843pshufb m5, m2, [tab_S2]
27844movh [r0 + 1470 * 16 + 8], m5
27845pshufb m5, m1, [tab_S2]
27846movh [r0 + 1471 * 16], m5
27847pshufb m5, m3, [tab_S2]
27848movh [r0 + 1471 * 16 + 8], m5
27849
27850; mode 25 [row 0]
27851movu m6, [r5 + 30 * 16]
27852movu m0, [r3 ]
27853movu m1, [r3 + 1 ]
27854punpcklbw m0, m1
27855pmaddubsw m4, m0, m6
27856pmulhrsw m4, m7
27857movu m2, [r3 + 8]
27858movu m3, [r3 + 9]
27859punpcklbw m2, m3
27860pmaddubsw m5, m2, m6
27861pmulhrsw m5, m7
27862packuswb m4, m5
27863movu [r0 + 1472 * 16], m4
27864
27865movu m1, [r3 + 16]
27866movu m3, [r3 + 17]
27867punpcklbw m1, m3
27868pmaddubsw m4, m1, m6
27869pmulhrsw m4, m7
27870movu m3, [r3 + 24]
27871movu m5, [r3 + 25]
27872punpcklbw m3, m5
27873pmaddubsw m5, m3, m6
27874pmulhrsw m5, m7
27875packuswb m4, m5
27876movu [r0 + 1473 * 16], m4
27877
27878; mode 25 [row 1]
27879movu m6, [r5 + 28 * 16]
27880pmaddubsw m4, m0, m6
27881pmulhrsw m4, m7
27882pmaddubsw m5, m2, m6
27883pmulhrsw m5, m7
27884packuswb m4, m5
27885movu [r0 + 1474 * 16], m4
27886pmaddubsw m4, m1, m6
27887pmulhrsw m4, m7
27888pmaddubsw m5, m3, m6
27889pmulhrsw m5, m7
27890packuswb m4, m5
27891movu [r0 + 1475 * 16], m4
27892
27893; mode 25 [row 2]
27894movu m6, [r5 + 26 * 16]
27895pmaddubsw m4, m0, m6
27896pmulhrsw m4, m7
27897pmaddubsw m5, m2, m6
27898pmulhrsw m5, m7
27899packuswb m4, m5
27900movu [r0 + 1476 * 16], m4
27901pmaddubsw m4, m1, m6
27902pmulhrsw m4, m7
27903pmaddubsw m5, m3, m6
27904pmulhrsw m5, m7
27905packuswb m4, m5
27906movu [r0 + 1477 * 16], m4
27907
27908; mode 25 [row 3]
27909movu m6, [r5 + 24 * 16]
27910pmaddubsw m4, m0, m6
27911pmulhrsw m4, m7
27912pmaddubsw m5, m2, m6
27913pmulhrsw m5, m7
27914packuswb m4, m5
27915movu [r0 + 1478 * 16], m4
27916pmaddubsw m4, m1, m6
27917pmulhrsw m4, m7
27918pmaddubsw m5, m3, m6
27919pmulhrsw m5, m7
27920packuswb m4, m5
27921movu [r0 + 1479 * 16], m4
27922
27923; mode 25 [row 4]
27924movu m6, [r5 + 22 * 16]
27925pmaddubsw m4, m0, m6
27926pmulhrsw m4, m7
27927pmaddubsw m5, m2, m6
27928pmulhrsw m5, m7
27929packuswb m4, m5
27930movu [r0 + 1480 * 16], m4
27931pmaddubsw m4, m1, m6
27932pmulhrsw m4, m7
27933pmaddubsw m5, m3, m6
27934pmulhrsw m5, m7
27935packuswb m4, m5
27936movu [r0 + 1481 * 16], m4
27937
27938; mode 25 [row 5]
27939movu m6, [r5 + 20 * 16]
27940pmaddubsw m4, m0, m6
27941pmulhrsw m4, m7
27942pmaddubsw m5, m2, m6
27943pmulhrsw m5, m7
27944packuswb m4, m5
27945movu [r0 + 1482 * 16], m4
27946pmaddubsw m4, m1, m6
27947pmulhrsw m4, m7
27948pmaddubsw m5, m3, m6
27949pmulhrsw m5, m7
27950packuswb m4, m5
27951movu [r0 + 1483 * 16], m4
27952
27953; mode 25 [row 6]
27954movu m6, [r5 + 18 * 16]
27955pmaddubsw m4, m0, m6
27956pmulhrsw m4, m7
27957pmaddubsw m5, m2, m6
27958pmulhrsw m5, m7
27959packuswb m4, m5
27960movu [r0 + 1484 * 16], m4
27961pmaddubsw m4, m1, m6
27962pmulhrsw m4, m7
27963pmaddubsw m5, m3, m6
27964pmulhrsw m5, m7
27965packuswb m4, m5
27966movu [r0 + 1485 * 16], m4
27967
27968; mode 25 [row 7]
27969movu m6, [r5 + 16 * 16]
27970pmaddubsw m4, m0, m6
27971pmulhrsw m4, m7
27972pmaddubsw m5, m2, m6
27973pmulhrsw m5, m7
27974packuswb m4, m5
27975movu [r0 + 1486 * 16], m4
27976pmaddubsw m4, m1, m6
27977pmulhrsw m4, m7
27978pmaddubsw m5, m3, m6
27979pmulhrsw m5, m7
27980packuswb m4, m5
27981movu [r0 + 1487 * 16], m4
27982
27983; mode 25 [row 8]
27984movu m6, [r5 + 14 * 16]
27985pmaddubsw m4, m0, m6
27986pmulhrsw m4, m7
27987pmaddubsw m5, m2, m6
27988pmulhrsw m5, m7
27989packuswb m4, m5
27990movu [r0 + 1488 * 16], m4
27991pmaddubsw m4, m1, m6
27992pmulhrsw m4, m7
27993pmaddubsw m5, m3, m6
27994pmulhrsw m5, m7
27995packuswb m4, m5
27996movu [r0 + 1489 * 16], m4
27997
27998; mode 25 [row 9]
27999movu m6, [r5 + 12 * 16]
28000pmaddubsw m4, m0, m6
28001pmulhrsw m4, m7
28002pmaddubsw m5, m2, m6
28003pmulhrsw m5, m7
28004packuswb m4, m5
28005movu [r0 + 1490 * 16], m4
28006pmaddubsw m4, m1, m6
28007pmulhrsw m4, m7
28008pmaddubsw m5, m3, m6
28009pmulhrsw m5, m7
28010packuswb m4, m5
28011movu [r0 + 1491 * 16], m4
28012
28013; mode 25 [row 10]
28014movu m6, [r5 + 10 * 16]
28015pmaddubsw m4, m0, m6
28016pmulhrsw m4, m7
28017pmaddubsw m5, m2, m6
28018pmulhrsw m5, m7
28019packuswb m4, m5
28020movu [r0 + 1492 * 16], m4
28021pmaddubsw m4, m1, m6
28022pmulhrsw m4, m7
28023pmaddubsw m5, m3, m6
28024pmulhrsw m5, m7
28025packuswb m4, m5
28026movu [r0 + 1493 * 16], m4
28027
28028; mode 25 [row 11]
28029movu m6, [r5 + 8 * 16]
28030pmaddubsw m4, m0, m6
28031pmulhrsw m4, m7
28032pmaddubsw m5, m2, m6
28033pmulhrsw m5, m7
28034packuswb m4, m5
28035movu [r0 + 1494 * 16], m4
28036pmaddubsw m4, m1, m6
28037pmulhrsw m4, m7
28038pmaddubsw m5, m3, m6
28039pmulhrsw m5, m7
28040packuswb m4, m5
28041movu [r0 + 1495 * 16], m4
28042
28043; mode 25 [row 12]
28044movu m6, [r5 + 6 * 16]
28045pmaddubsw m4, m0, m6
28046pmulhrsw m4, m7
28047pmaddubsw m5, m2, m6
28048pmulhrsw m5, m7
28049packuswb m4, m5
28050movu [r0 + 1496 * 16], m4
28051pmaddubsw m4, m1, m6
28052pmulhrsw m4, m7
28053pmaddubsw m5, m3, m6
28054pmulhrsw m5, m7
28055packuswb m4, m5
28056movu [r0 + 1497 * 16], m4
28057
28058; mode 25 [row 13]
28059movu m6, [r5 + 4 * 16]
28060pmaddubsw m4, m0, m6
28061pmulhrsw m4, m7
28062pmaddubsw m5, m2, m6
28063pmulhrsw m5, m7
28064packuswb m4, m5
28065movu [r0 + 1498 * 16], m4
28066pmaddubsw m4, m1, m6
28067pmulhrsw m4, m7
28068pmaddubsw m5, m3, m6
28069pmulhrsw m5, m7
28070packuswb m4, m5
28071movu [r0 + 1499 * 16], m4
28072
28073; mode 25 [row 14]
28074movu m6, [r5 + 2 * 16]
28075pmaddubsw m4, m0, m6
28076pmulhrsw m4, m7
28077pmaddubsw m5, m2, m6
28078pmulhrsw m5, m7
28079packuswb m4, m5
28080movu [r0 + 1500 * 16], m4
28081pmaddubsw m4, m1, m6
28082pmulhrsw m4, m7
28083pmaddubsw m5, m3, m6
28084pmulhrsw m5, m7
28085packuswb m4, m5
28086movu [r0 + 1501 * 16], m4
28087
28088; mode 25 [row 15]
28089pshufb m5, m0, [tab_S2]
28090movh [r0 + 1502 * 16], m5
28091pshufb m5, m2, [tab_S2]
28092movh [r0 + 1502 * 16 + 8], m5
28093pshufb m5, m1, [tab_S2]
28094movh [r0 + 1503 * 16], m5
28095pshufb m5, m3, [tab_S2]
28096movh [r0 + 1503 * 16 + 8], m5
28097
28098; mode 25 [row 16]
28099movu m6, [r5 + 30 * 16]
28100pslldq m0, 2
28101pinsrb m0, [r4 + 0], 1
28102pinsrb m0, [r4 + 16], 0
28103pmaddubsw m4, m0, m6
28104pmulhrsw m4, m7
28105pslldq m2, 2
28106pinsrw m2, [r3 + 7], 0
28107pmaddubsw m5, m2, m6
28108pmulhrsw m5, m7
28109packuswb m4, m5
28110movu [r0 + 1504 * 16], m4
28111pslldq m1, 2
28112pinsrw m1, [r3 + 15], 0
28113pmaddubsw m4, m1, m6
28114pmulhrsw m4, m7
28115pslldq m3, 2
28116pinsrw m3, [r3 + 23], 0
28117pmaddubsw m5, m3, m6
28118pmulhrsw m5, m7
28119packuswb m4, m5
28120movu [r0 + 1505 * 16], m4
28121
28122; mode 25 [row 17]
28123movu m6, [r5 + 28 * 16]
28124pmaddubsw m4, m0, m6
28125pmulhrsw m4, m7
28126pmaddubsw m5, m2, m6
28127pmulhrsw m5, m7
28128packuswb m4, m5
28129movu [r0 + 1506 * 16], m4
28130pmaddubsw m4, m1, m6
28131pmulhrsw m4, m7
28132pmaddubsw m5, m3, m6
28133pmulhrsw m5, m7
28134packuswb m4, m5
28135movu [r0 + 1507 * 16], m4
28136
28137; mode 25 [row 18]
28138movu m6, [r5 + 26 * 16]
28139pmaddubsw m4, m0, m6
28140pmulhrsw m4, m7
28141pmaddubsw m5, m2, m6
28142pmulhrsw m5, m7
28143packuswb m4, m5
28144movu [r0 + 1508 * 16], m4
28145pmaddubsw m4, m1, m6
28146pmulhrsw m4, m7
28147pmaddubsw m5, m3, m6
28148pmulhrsw m5, m7
28149packuswb m4, m5
28150movu [r0 + 1509 * 16], m4
28151
28152; mode 25 [row 19]
28153movu m6, [r5 + 24 * 16]
28154pmaddubsw m4, m0, m6
28155pmulhrsw m4, m7
28156pmaddubsw m5, m2, m6
28157pmulhrsw m5, m7
28158packuswb m4, m5
28159movu [r0 + 1510 * 16], m4
28160pmaddubsw m4, m1, m6
28161pmulhrsw m4, m7
28162pmaddubsw m5, m3, m6
28163pmulhrsw m5, m7
28164packuswb m4, m5
28165movu [r0 + 1511 * 16], m4
28166
28167; mode 25 [row 20]
28168movu m6, [r5 + 22 * 16]
28169pmaddubsw m4, m0, m6
28170pmulhrsw m4, m7
28171pmaddubsw m5, m2, m6
28172pmulhrsw m5, m7
28173packuswb m4, m5
28174movu [r0 + 1512 * 16], m4
28175pmaddubsw m4, m1, m6
28176pmulhrsw m4, m7
28177pmaddubsw m5, m3, m6
28178pmulhrsw m5, m7
28179packuswb m4, m5
28180movu [r0 + 1513 * 16], m4
28181
28182; mode 25 [row 21]
28183movu m6, [r5 + 20 * 16]
28184pmaddubsw m4, m0, m6
28185pmulhrsw m4, m7
28186pmaddubsw m5, m2, m6
28187pmulhrsw m5, m7
28188packuswb m4, m5
28189movu [r0 + 1514 * 16], m4
28190pmaddubsw m4, m1, m6
28191pmulhrsw m4, m7
28192pmaddubsw m5, m3, m6
28193pmulhrsw m5, m7
28194packuswb m4, m5
28195movu [r0 + 1515 * 16], m4
28196
28197; mode 25 [row 22]
28198movu m6, [r5 + 18 * 16]
28199pmaddubsw m4, m0, m6
28200pmulhrsw m4, m7
28201pmaddubsw m5, m2, m6
28202pmulhrsw m5, m7
28203packuswb m4, m5
28204movu [r0 + 1516 * 16], m4
28205pmaddubsw m4, m1, m6
28206pmulhrsw m4, m7
28207pmaddubsw m5, m3, m6
28208pmulhrsw m5, m7
28209packuswb m4, m5
28210movu [r0 + 1517 * 16], m4
28211
28212; mode 25 [row 23]
28213movu m6, [r5 + 16 * 16]
28214pmaddubsw m4, m0, m6
28215pmulhrsw m4, m7
28216pmaddubsw m5, m2, m6
28217pmulhrsw m5, m7
28218packuswb m4, m5
28219movu [r0 + 1518 * 16], m4
28220pmaddubsw m4, m1, m6
28221pmulhrsw m4, m7
28222pmaddubsw m5, m3, m6
28223pmulhrsw m5, m7
28224packuswb m4, m5
28225movu [r0 + 1519 * 16], m4
28226
28227; mode 25 [row 24]
28228movu m6, [r5 + 14 * 16]
28229pmaddubsw m4, m0, m6
28230pmulhrsw m4, m7
28231pmaddubsw m5, m2, m6
28232pmulhrsw m5, m7
28233packuswb m4, m5
28234movu [r0 + 1520 * 16], m4
28235pmaddubsw m4, m1, m6
28236pmulhrsw m4, m7
28237pmaddubsw m5, m3, m6
28238pmulhrsw m5, m7
28239packuswb m4, m5
28240movu [r0 + 1521 * 16], m4
28241
28242; mode 25 [row 25]
28243movu m6, [r5 + 12 * 16]
28244pmaddubsw m4, m0, m6
28245pmulhrsw m4, m7
28246pmaddubsw m5, m2, m6
28247pmulhrsw m5, m7
28248packuswb m4, m5
28249movu [r0 + 1522 * 16], m4
28250pmaddubsw m4, m1, m6
28251pmulhrsw m4, m7
28252pmaddubsw m5, m3, m6
28253pmulhrsw m5, m7
28254packuswb m4, m5
28255movu [r0 + 1523 * 16], m4
28256
28257; mode 25 [row 26]
28258movu m6, [r5 + 10 * 16]
28259pmaddubsw m4, m0, m6
28260pmulhrsw m4, m7
28261pmaddubsw m5, m2, m6
28262pmulhrsw m5, m7
28263packuswb m4, m5
28264movu [r0 + 1524 * 16], m4
28265pmaddubsw m4, m1, m6
28266pmulhrsw m4, m7
28267pmaddubsw m5, m3, m6
28268pmulhrsw m5, m7
28269packuswb m4, m5
28270movu [r0 + 1525 * 16], m4
28271
28272; mode 25 [row 27]
28273movu m6, [r5 + 8 * 16]
28274pmaddubsw m4, m0, m6
28275pmulhrsw m4, m7
28276pmaddubsw m5, m2, m6
28277pmulhrsw m5, m7
28278packuswb m4, m5
28279movu [r0 + 1526 * 16], m4
28280pmaddubsw m4, m1, m6
28281pmulhrsw m4, m7
28282pmaddubsw m5, m3, m6
28283pmulhrsw m5, m7
28284packuswb m4, m5
28285movu [r0 + 1527 * 16], m4
28286
28287; mode 25 [row 28]
28288movu m6, [r5 + 6 * 16]
28289pmaddubsw m4, m0, m6
28290pmulhrsw m4, m7
28291pmaddubsw m5, m2, m6
28292pmulhrsw m5, m7
28293packuswb m4, m5
28294movu [r0 + 1528 * 16], m4
28295pmaddubsw m4, m1, m6
28296pmulhrsw m4, m7
28297pmaddubsw m5, m3, m6
28298pmulhrsw m5, m7
28299packuswb m4, m5
28300movu [r0 + 1529 * 16], m4
28301
28302; mode 25 [row 29]
28303movu m6, [r5 + 4 * 16]
28304pmaddubsw m4, m0, m6
28305pmulhrsw m4, m7
28306pmaddubsw m5, m2, m6
28307pmulhrsw m5, m7
28308packuswb m4, m5
28309movu [r0 + 1530 * 16], m4
28310pmaddubsw m4, m1, m6
28311pmulhrsw m4, m7
28312pmaddubsw m5, m3, m6
28313pmulhrsw m5, m7
28314packuswb m4, m5
28315movu [r0 + 1531 * 16], m4
28316
28317; mode 25 [row 30]
28318movu m6, [r5 + 2 * 16]
28319pmaddubsw m4, m0, m6
28320pmulhrsw m4, m7
28321pmaddubsw m5, m2, m6
28322pmulhrsw m5, m7
28323packuswb m4, m5
28324movu [r0 + 1532 * 16], m4
28325pmaddubsw m4, m1, m6
28326pmulhrsw m4, m7
28327pmaddubsw m5, m3, m6
28328pmulhrsw m5, m7
28329packuswb m4, m5
28330movu [r0 + 1533 * 16], m4
28331
28332; mode 25 [row 31]
28333pshufb m5, m0, [tab_S2]
28334movh [r0 + 1534 * 16], m5
28335pshufb m5, m2, [tab_S2]
28336movh [r0 + 1534 * 16 + 8], m5
28337pshufb m5, m1, [tab_S2]
28338movh [r0 + 1535 * 16], m5
28339pshufb m5, m3, [tab_S2]
28340movh [r0 + 1535 * 16 + 8], m5
28341
28342; mode 26
28343movu m1, [r1 + 1]
28344movu m2, [r1 + 17]
28345movu [r0 + 1536 * 16], m1
28346movu [r0 + 1537 * 16], m2
28347movu [r0 + 1538 * 16], m1
28348movu [r0 + 1539 * 16], m2
28349movu [r0 + 1540 * 16], m1
28350movu [r0 + 1541 * 16], m2
28351movu [r0 + 1542 * 16], m1
28352movu [r0 + 1543 * 16], m2
28353movu [r0 + 1544 * 16], m1
28354movu [r0 + 1545 * 16], m2
28355movu [r0 + 1546 * 16], m1
28356movu [r0 + 1547 * 16], m2
28357movu [r0 + 1548 * 16], m1
28358movu [r0 + 1549 * 16], m2
28359movu [r0 + 1550 * 16], m1
28360movu [r0 + 1551 * 16], m2
28361
28362movu [r0 + 1552 * 16], m1
28363movu [r0 + 1553 * 16], m2
28364movu [r0 + 1554 * 16], m1
28365movu [r0 + 1555 * 16], m2
28366movu [r0 + 1556 * 16], m1
28367movu [r0 + 1557 * 16], m2
28368movu [r0 + 1558 * 16], m1
28369movu [r0 + 1559 * 16], m2
28370movu [r0 + 1560 * 16], m1
28371movu [r0 + 1561 * 16], m2
28372movu [r0 + 1562 * 16], m1
28373movu [r0 + 1563 * 16], m2
28374movu [r0 + 1564 * 16], m1
28375movu [r0 + 1565 * 16], m2
28376movu [r0 + 1566 * 16], m1
28377movu [r0 + 1567 * 16], m2
28378
28379movu [r0 + 1568 * 16], m1
28380movu [r0 + 1569 * 16], m2
28381movu [r0 + 1570 * 16], m1
28382movu [r0 + 1571 * 16], m2
28383movu [r0 + 1572 * 16], m1
28384movu [r0 + 1573 * 16], m2
28385movu [r0 + 1574 * 16], m1
28386movu [r0 + 1575 * 16], m2
28387movu [r0 + 1576 * 16], m1
28388movu [r0 + 1577 * 16], m2
28389movu [r0 + 1578 * 16], m1
28390movu [r0 + 1579 * 16], m2
28391movu [r0 + 1580 * 16], m1
28392movu [r0 + 1581 * 16], m2
28393movu [r0 + 1582 * 16], m1
28394movu [r0 + 1583 * 16], m2
28395
28396movu [r0 + 1584 * 16], m1
28397movu [r0 + 1585 * 16], m2
28398movu [r0 + 1586 * 16], m1
28399movu [r0 + 1587 * 16], m2
28400movu [r0 + 1588 * 16], m1
28401movu [r0 + 1589 * 16], m2
28402movu [r0 + 1590 * 16], m1
28403movu [r0 + 1591 * 16], m2
28404movu [r0 + 1592 * 16], m1
28405movu [r0 + 1593 * 16], m2
28406movu [r0 + 1594 * 16], m1
28407movu [r0 + 1595 * 16], m2
28408movu [r0 + 1596 * 16], m1
28409movu [r0 + 1597 * 16], m2
28410movu [r0 + 1598 * 16], m1
28411movu [r0 + 1599 * 16], m2
28412
28413; mode 27 [row 0]
28414movu m6, [r5 + 2 * 16]
28415movu m0, [r3 + 1 ]
28416movu m1, [r3 + 2 ]
28417punpcklbw m0, m1
28418pmaddubsw m4, m0, m6
28419pmulhrsw m4, m7
28420movu m2, [r3 + 9]
28421movu m3, [r3 + 10]
28422punpcklbw m2, m3
28423pmaddubsw m5, m2, m6
28424pmulhrsw m5, m7
28425packuswb m4, m5
28426movu [r0 + 1600 * 16], m4
28427
28428movu m1, [r3 + 17]
28429movu m3, [r3 + 18]
28430punpcklbw m1, m3
28431pmaddubsw m4, m1, m6
28432pmulhrsw m4, m7
28433movu m3, [r3 + 25]
28434movu m5, [r3 + 26]
28435punpcklbw m3, m5
28436pmaddubsw m5, m3, m6
28437pmulhrsw m5, m7
28438packuswb m4, m5
28439movu [r0 + 1601 * 16], m4
28440
28441; mode 27 [row 1]
28442movu m6, [r5 + 4 * 16]
28443pmaddubsw m4, m0, m6
28444pmulhrsw m4, m7
28445pmaddubsw m5, m2, m6
28446pmulhrsw m5, m7
28447packuswb m4, m5
28448movu [r0 + 1602 * 16], m4
28449pmaddubsw m4, m1, m6
28450pmulhrsw m4, m7
28451pmaddubsw m5, m3, m6
28452pmulhrsw m5, m7
28453packuswb m4, m5
28454movu [r0 + 1603 * 16], m4
28455
28456; mode 27 [row 2]
28457movu m6, [r5 + 6 * 16]
28458pmaddubsw m4, m0, m6
28459pmulhrsw m4, m7
28460pmaddubsw m5, m2, m6
28461pmulhrsw m5, m7
28462packuswb m4, m5
28463movu [r0 + 1604 * 16], m4
28464pmaddubsw m4, m1, m6
28465pmulhrsw m4, m7
28466pmaddubsw m5, m3, m6
28467pmulhrsw m5, m7
28468packuswb m4, m5
28469movu [r0 + 1605 * 16], m4
28470
28471; mode 27 [row 3]
28472movu m6, [r5 + 8 * 16]
28473pmaddubsw m4, m0, m6
28474pmulhrsw m4, m7
28475pmaddubsw m5, m2, m6
28476pmulhrsw m5, m7
28477packuswb m4, m5
28478movu [r0 + 1606 * 16], m4
28479pmaddubsw m4, m1, m6
28480pmulhrsw m4, m7
28481pmaddubsw m5, m3, m6
28482pmulhrsw m5, m7
28483packuswb m4, m5
28484movu [r0 + 1607 * 16], m4
28485
28486; mode 27 [row 4]
28487movu m6, [r5 + 10 * 16]
28488pmaddubsw m4, m0, m6
28489pmulhrsw m4, m7
28490pmaddubsw m5, m2, m6
28491pmulhrsw m5, m7
28492packuswb m4, m5
28493movu [r0 + 1608 * 16], m4
28494
28495; mode 28 [row 1 - first half]
28496movu [r0 + 1666 * 16], m4
28497
28498pmaddubsw m4, m1, m6
28499pmulhrsw m4, m7
28500pmaddubsw m5, m3, m6
28501pmulhrsw m5, m7
28502packuswb m4, m5
28503movu [r0 + 1609 * 16], m4
28504
28505; mode 28 [row 1 - second half]
28506movu [r0 + 1667 * 16], m4
28507
28508; mode 27 [row 5]
28509movu m6, [r5 + 12 * 16]
28510pmaddubsw m4, m0, m6
28511pmulhrsw m4, m7
28512pmaddubsw m5, m2, m6
28513pmulhrsw m5, m7
28514packuswb m4, m5
28515movu [r0 + 1610 * 16], m4
28516
28517pmaddubsw m4, m1, m6
28518pmulhrsw m4, m7
28519pmaddubsw m5, m3, m6
28520pmulhrsw m5, m7
28521packuswb m4, m5
28522movu [r0 + 1611 * 16], m4
28523
28524; mode 27 [row 6]
28525movu m6, [r5 + 14 * 16]
28526pmaddubsw m4, m0, m6
28527pmulhrsw m4, m7
28528pmaddubsw m5, m2, m6
28529pmulhrsw m5, m7
28530packuswb m4, m5
28531movu [r0 + 1612 * 16], m4
28532pmaddubsw m4, m1, m6
28533pmulhrsw m4, m7
28534pmaddubsw m5, m3, m6
28535pmulhrsw m5, m7
28536packuswb m4, m5
28537movu [r0 + 1613 * 16], m4
28538
28539; mode 27 [row 7]
28540movu m6, [r5 + 16 * 16]
28541pmaddubsw m4, m0, m6
28542pmulhrsw m4, m7
28543pmaddubsw m5, m2, m6
28544pmulhrsw m5, m7
28545packuswb m4, m5
28546movu [r0 + 1614 * 16], m4
28547pmaddubsw m4, m1, m6
28548pmulhrsw m4, m7
28549pmaddubsw m5, m3, m6
28550pmulhrsw m5, m7
28551packuswb m4, m5
28552movu [r0 + 1615 * 16], m4
28553
28554; mode 27 [row 8]
28555movu m6, [r5 + 18 * 16]
28556pmaddubsw m4, m0, m6
28557pmulhrsw m4, m7
28558pmaddubsw m5, m2, m6
28559pmulhrsw m5, m7
28560packuswb m4, m5
28561movu [r0 + 1616 * 16], m4
28562
28563; mode 29 [row 1 - first half]
28564movu [r0 + 1730 * 16], m4
28565
28566pmaddubsw m4, m1, m6
28567pmulhrsw m4, m7
28568pmaddubsw m5, m3, m6
28569pmulhrsw m5, m7
28570packuswb m4, m5
28571movu [r0 + 1617 * 16], m4
28572
28573; mode 29 [row 1 - second half]
28574movu [r0 + 1731 * 16], m4
28575
28576; mode 27 [row 9]
28577movu m6, [r5 + 20 * 16]
28578pmaddubsw m4, m0, m6
28579pmulhrsw m4, m7
28580pmaddubsw m5, m2, m6
28581pmulhrsw m5, m7
28582packuswb m4, m5
28583movu [r0 + 1618 * 16], m4
28584
28585; mode 28 [row 3 - first half]
28586movu [r0 + 1670 * 16], m4
28587
28588pmaddubsw m4, m1, m6
28589pmulhrsw m4, m7
28590pmaddubsw m5, m3, m6
28591pmulhrsw m5, m7
28592packuswb m4, m5
28593movu [r0 + 1619 * 16], m4
28594
28595; mode 28 [row 3 - second half]
28596movu [r0 + 1671 * 16], m4
28597
28598; mode 27 [row 10]
28599movu m6, [r5 + 22 * 16]
28600pmaddubsw m4, m0, m6
28601pmulhrsw m4, m7
28602pmaddubsw m5, m2, m6
28603pmulhrsw m5, m7
28604packuswb m4, m5
28605movu [r0 + 1620 * 16], m4
28606pmaddubsw m4, m1, m6
28607pmulhrsw m4, m7
28608pmaddubsw m5, m3, m6
28609pmulhrsw m5, m7
28610packuswb m4, m5
28611movu [r0 + 1621 * 16], m4
28612
28613; mode 27 [row 11]
28614movu m6, [r5 + 24 * 16]
28615pmaddubsw m4, m0, m6
28616pmulhrsw m4, m7
28617pmaddubsw m5, m2, m6
28618pmulhrsw m5, m7
28619packuswb m4, m5
28620movu [r0 + 1622 * 16], m4
28621pmaddubsw m4, m1, m6
28622pmulhrsw m4, m7
28623pmaddubsw m5, m3, m6
28624pmulhrsw m5, m7
28625packuswb m4, m5
28626movu [r0 + 1623 * 16], m4
28627
28628; mode 27 [row 12]
28629movu m6, [r5 + 26 * 16]
28630pmaddubsw m4, m0, m6
28631pmulhrsw m4, m7
28632pmaddubsw m5, m2, m6
28633pmulhrsw m5, m7
28634packuswb m4, m5
28635movu [r0 + 1624 * 16], m4
28636
28637; mode 30 [row 1 - first half]
28638movu [r0 + 1794 * 16], m4
28639
28640; mode 33 [row 0 - first half]
28641movu [r0 + 1984 * 16], m4
28642
28643pmaddubsw m4, m1, m6
28644pmulhrsw m4, m7
28645pmaddubsw m5, m3, m6
28646pmulhrsw m5, m7
28647packuswb m4, m5
28648movu [r0 + 1625 * 16], m4
28649
28650; mode 30 [row 1 - second half]
28651movu [r0 + 1795 * 16], m4
28652
28653; mode 33 [row 0 - second half]
28654movu [r0 + 1985 * 16], m4
28655
28656; mode 27 [row 13]
28657movu m6, [r5 + 28 * 16]
28658pmaddubsw m4, m0, m6
28659pmulhrsw m4, m7
28660pmaddubsw m5, m2, m6
28661pmulhrsw m5, m7
28662packuswb m4, m5
28663movu [r0 + 1626 * 16], m4
28664pmaddubsw m4, m1, m6
28665pmulhrsw m4, m7
28666pmaddubsw m5, m3, m6
28667pmulhrsw m5, m7
28668packuswb m4, m5
28669movu [r0 + 1627 * 16], m4
28670
28671; mode 27 [row 14]
28672movu m6, [r5 + 30 * 16]
28673pmaddubsw m4, m0, m6
28674pmulhrsw m4, m7
28675pmaddubsw m5, m2, m6
28676pmulhrsw m5, m7
28677packuswb m4, m5
28678movu [r0 + 1628 * 16], m4
28679
28680; mode 28 [row 5 - first half]
28681movu [r0 + 1674 * 16], m4
28682
28683pmaddubsw m4, m1, m6
28684pmulhrsw m4, m7
28685pmaddubsw m5, m3, m6
28686pmulhrsw m5, m7
28687packuswb m4, m5
28688movu [r0 + 1629 * 16], m4
28689
28690; mode 28 [row 5 - second half]
28691movu [r0 + 1675 * 16], m4
28692
28693; mode 28 [row 0]
28694movu m6, [r5 + 5 * 16]
28695pmaddubsw m4, m0, m6
28696pmulhrsw m4, m7
28697pmaddubsw m5, m2, m6
28698pmulhrsw m5, m7
28699packuswb m4, m5
28700movu [r0 + 1664 * 16], m4
28701pmaddubsw m4, m1, m6
28702pmulhrsw m4, m7
28703pmaddubsw m5, m3, m6
28704pmulhrsw m5, m7
28705packuswb m4, m5
28706movu [r0 + 1665 * 16], m4
28707
28708; mode 28 [row 2]
28709movu m6, [r5 + 15 * 16]
28710pmaddubsw m4, m0, m6
28711pmulhrsw m4, m7
28712pmaddubsw m5, m2, m6
28713pmulhrsw m5, m7
28714packuswb m4, m5
28715movu [r0 + 1668 * 16], m4
28716pmaddubsw m4, m1, m6
28717pmulhrsw m4, m7
28718pmaddubsw m5, m3, m6
28719pmulhrsw m5, m7
28720packuswb m4, m5
28721movu [r0 + 1669 * 16], m4
28722
28723; mode 28 [row 4]
28724movu m6, [r5 + 25 * 16]
28725pmaddubsw m4, m0, m6
28726pmulhrsw m4, m7
28727pmaddubsw m5, m2, m6
28728pmulhrsw m5, m7
28729packuswb m4, m5
28730movu [r0 + 1672 * 16], m4
28731pmaddubsw m4, m1, m6
28732pmulhrsw m4, m7
28733pmaddubsw m5, m3, m6
28734pmulhrsw m5, m7
28735packuswb m4, m5
28736movu [r0 + 1673 * 16], m4
28737
28738; mode 30 [row 0]
28739movu m6, [r5 + 13 * 16]
28740pmaddubsw m4, m0, m6
28741pmulhrsw m4, m7
28742pmaddubsw m5, m2, m6
28743pmulhrsw m5, m7
28744packuswb m4, m5
28745movu [r0 + 1792 * 16], m4
28746pmaddubsw m4, m1, m6
28747pmulhrsw m4, m7
28748pmaddubsw m5, m3, m6
28749pmulhrsw m5, m7
28750packuswb m4, m5
28751movu [r0 + 1793 * 16], m4
28752
28753; mode 29 [row 0]
28754movu m6, [r5 + 9 * 16]
28755pmaddubsw m4, m0, m6
28756pmulhrsw m4, m7
28757pmaddubsw m5, m2, m6
28758pmulhrsw m5, m7
28759packuswb m4, m5
28760movu [r0 + 1728 * 16], m4
28761pmaddubsw m4, m1, m6
28762pmulhrsw m4, m7
28763pmaddubsw m5, m3, m6
28764pmulhrsw m5, m7
28765packuswb m4, m5
28766movu [r0 + 1729 * 16], m4
28767
28768; mode 29 [row 2]
28769movu m6, [r5 + 27 * 16]
28770pmaddubsw m4, m0, m6
28771pmulhrsw m4, m7
28772pmaddubsw m5, m2, m6
28773pmulhrsw m5, m7
28774packuswb m4, m5
28775movu [r0 + 1732 * 16], m4
28776pmaddubsw m4, m1, m6
28777pmulhrsw m4, m7
28778pmaddubsw m5, m3, m6
28779pmulhrsw m5, m7
28780packuswb m4, m5
28781movu [r0 + 1733 * 16], m4
28782
28783; mode 31 [row 0]
28784movu m6, [r5 + 17 * 16]
28785pmaddubsw m4, m0, m6
28786pmulhrsw m4, m7
28787pmaddubsw m5, m2, m6
28788pmulhrsw m5, m7
28789packuswb m4, m5
28790movu [r0 + 1856 * 16], m4
28791pmaddubsw m4, m1, m6
28792pmulhrsw m4, m7
28793pmaddubsw m5, m3, m6
28794pmulhrsw m5, m7
28795packuswb m4, m5
28796movu [r0 + 1857 * 16], m4
28797
28798; mode 32 [row 0]
28799movu m6, [r5 + 21 * 16]
28800pmaddubsw m4, m0, m6
28801pmulhrsw m4, m7
28802pmaddubsw m5, m2, m6
28803pmulhrsw m5, m7
28804packuswb m4, m5
28805movu [r0 + 1920 * 16], m4
28806pmaddubsw m4, m1, m6
28807pmulhrsw m4, m7
28808pmaddubsw m5, m3, m6
28809pmulhrsw m5, m7
28810packuswb m4, m5
28811movu [r0 + 1921 * 16], m4
28812
28813; mode 27 [row 15]
28814movu m0, [r3 + 2]
28815movd m1, [r3 + 3]
28816palignr m1, m0, 1
28817punpcklbw m0, m1
28818movu m2, [r3 + 10]
28819movd m3, [r3 + 11]
28820palignr m3, m2, 1
28821punpcklbw m2, m3
28822movu m1, [r3 + 18]
28823movd m3, [r3 + 19]
28824palignr m3, m1, 1
28825punpcklbw m1, m3
28826movu m4, [r3 + 26]
28827movd m5, [r3 + 27]
28828palignr m5, m4, 1
28829punpcklbw m4, m5
28830
28831pshufb m5, m0, [tab_S2]
28832movh [r0 + 1630 * 16], m5
28833pshufb m5, m2, [tab_S2]
28834movh [r0 + 1630 * 16 + 8], m5
28835pshufb m5, m1, [tab_S2]
28836movh [r0 + 1631 * 16], m5
28837pshufb m5, m4, [tab_S2]
28838movh [r0 + 1631 * 16 + 8], m5
28839
28840; mode 27 [row 16]
28841movu m6, [r5 + 2 * 16]
28842pmaddubsw m3, m0, m6
28843pmulhrsw m3, m7
28844pmaddubsw m5, m2, m6
28845pmulhrsw m5, m7
28846packuswb m3, m5
28847movu [r0 + 1632 * 16], m3
28848
28849; mode 31 [row 1 - first half]
28850movu [r0 + 1858 * 16], m3
28851
28852pmaddubsw m3, m1, m6
28853pmulhrsw m3, m7
28854pmaddubsw m5, m4, m6
28855pmulhrsw m5, m7
28856packuswb m3, m5
28857movu [r0 + 1633 * 16], m3
28858
28859; mode 31 [row 1 - second half]
28860movu [r0 + 1859 * 16], m3
28861
28862; mode 27 [row 17]
28863movu m6, [r5 + 4 * 16]
28864pmaddubsw m3, m0, m6
28865pmulhrsw m3, m7
28866pmaddubsw m5, m2, m6
28867pmulhrsw m5, m7
28868packuswb m3, m5
28869movu [r0 + 1634 * 16], m3
28870
28871; mode 29 [row 3 - first half]
28872movu [r0 + 1734 * 16], m3
28873
28874pmaddubsw m3, m1, m6
28875pmulhrsw m3, m7
28876pmaddubsw m5, m4, m6
28877pmulhrsw m5, m7
28878packuswb m3, m5
28879movu [r0 + 1635 * 16], m3
28880
28881; mode 29 [row 3 - second half]
28882movu [r0 + 1735 * 16], m3
28883
28884; mode 27 [row 18]
28885movu m6, [r5 + 6 * 16]
28886pmaddubsw m3, m0, m6
28887pmulhrsw m3, m7
28888pmaddubsw m5, m2, m6
28889pmulhrsw m5, m7
28890packuswb m3, m5
28891movu [r0 + 1636 * 16], m3
28892pmaddubsw m3, m1, m6
28893pmulhrsw m3, m7
28894pmaddubsw m5, m4, m6
28895pmulhrsw m5, m7
28896packuswb m3, m5
28897movu [r0 + 1637 * 16], m3
28898
28899; mode 27 [row 19]
28900movu m6, [r5 + 8 * 16]
28901pmaddubsw m3, m0, m6
28902pmulhrsw m3, m7
28903pmaddubsw m5, m2, m6
28904pmulhrsw m5, m7
28905packuswb m3, m5
28906movu [r0 + 1638 * 16], m3
28907
28908; mode 28 [row 7 - first half]
28909movu [r0 + 1678 * 16], m3
28910
28911pmaddubsw m3, m1, m6
28912pmulhrsw m3, m7
28913pmaddubsw m5, m4, m6
28914pmulhrsw m5, m7
28915packuswb m3, m5
28916movu [r0 + 1639 * 16], m3
28917
28918; mode 28 [row 7 - second half]
28919movu [r0 + 1679 * 16], m3
28920
28921; mode 27 [row 20]
28922movu m6, [r5 + 10 * 16]
28923pmaddubsw m3, m0, m6
28924pmulhrsw m3, m7
28925pmaddubsw m5, m2, m6
28926pmulhrsw m5, m7
28927packuswb m3, m5
28928movu [r0 + 1640 * 16], m3
28929
28930; mode 32 [row 1 - first half]
28931movu [r0 + 1922 * 16], m3
28932
28933pmaddubsw m3, m1, m6
28934pmulhrsw m3, m7
28935pmaddubsw m5, m4, m6
28936pmulhrsw m5, m7
28937packuswb m3, m5
28938movu [r0 + 1641 * 16], m3
28939
28940; mode 32 [row 1 - second half]
28941movu [r0 + 1923 * 16], m3
28942
28943; mode 27 [row 21]
28944movu m6, [r5 + 12 * 16]
28945pmaddubsw m3, m0, m6
28946pmulhrsw m3, m7
28947pmaddubsw m5, m2, m6
28948pmulhrsw m5, m7
28949packuswb m3, m5
28950movu [r0 + 1642 * 16], m3
28951pmaddubsw m3, m1, m6
28952pmulhrsw m3, m7
28953pmaddubsw m5, m4, m6
28954pmulhrsw m5, m7
28955packuswb m3, m5
28956movu [r0 + 1643 * 16], m3
28957
28958; mode 27 [row 22]
28959movu m6, [r5 + 14 * 16]
28960pmaddubsw m3, m0, m6
28961pmulhrsw m3, m7
28962pmaddubsw m5, m2, m6
28963pmulhrsw m5, m7
28964packuswb m3, m5
28965movu [r0 + 1644 * 16], m3
28966pmaddubsw m3, m1, m6
28967pmulhrsw m3, m7
28968pmaddubsw m5, m4, m6
28969pmulhrsw m5, m7
28970packuswb m3, m5
28971movu [r0 + 1645 * 16], m3
28972
28973; mode 27 [row 23]
28974movu m6, [r5 + 16 * 16]
28975pmaddubsw m3, m0, m6
28976pmulhrsw m3, m7
28977pmaddubsw m5, m2, m6
28978pmulhrsw m5, m7
28979packuswb m3, m5
28980movu [r0 + 1646 * 16], m3
28981pmaddubsw m3, m1, m6
28982pmulhrsw m3, m7
28983pmaddubsw m5, m4, m6
28984pmulhrsw m5, m7
28985packuswb m3, m5
28986movu [r0 + 1647 * 16], m3
28987
28988; mode 27 [row 24]
28989movu m6, [r5 + 18 * 16]
28990pmaddubsw m3, m0, m6
28991pmulhrsw m3, m7
28992pmaddubsw m5, m2, m6
28993pmulhrsw m5, m7
28994packuswb m3, m5
28995movu [r0 + 1648 * 16], m3
28996
28997; mode 28 [row 9 - first half]
28998movu [r0 + 1682 * 16], m3
28999
29000pmaddubsw m3, m1, m6
29001pmulhrsw m3, m7
29002pmaddubsw m5, m4, m6
29003pmulhrsw m5, m7
29004packuswb m3, m5
29005movu [r0 + 1649 * 16], m3
29006
29007; mode 28 [row 9 - second half]
29008movu [r0 + 1683 * 16], m3
29009
29010; mode 27 [row 25]
29011movu m6, [r5 + 20 * 16]
29012pmaddubsw m3, m0, m6
29013pmulhrsw m3, m7
29014pmaddubsw m5, m2, m6
29015pmulhrsw m5, m7
29016packuswb m3, m5
29017movu [r0 + 1650 * 16], m3
29018
29019; mode 30 [row 3 - first half]
29020movu [r0 + 1798 * 16], m3
29021
29022; mode 33 [row 1 - first half]
29023movu [r0 + 1986 * 16], m3
29024
29025pmaddubsw m3, m1, m6
29026pmulhrsw m3, m7
29027pmaddubsw m5, m4, m6
29028pmulhrsw m5, m7
29029packuswb m3, m5
29030movu [r0 + 1651 * 16], m3
29031
29032; mode 30 [row 3 - second half]
29033movu [r0 + 1799 * 16], m3
29034
29035; mode 33 [row 1 - second half]
29036movu [r0 + 1987 * 16], m3
29037
29038; mode 27 [row 26]
29039movu m6, [r5 + 22 * 16]
29040pmaddubsw m3, m0, m6
29041pmulhrsw m3, m7
29042pmaddubsw m5, m2, m6
29043pmulhrsw m5, m7
29044packuswb m3, m5
29045movu [r0 + 1652 * 16], m3
29046
29047; mode 29 [row 5 - first half]
29048movu [r0 + 1738 * 16], m3
29049
29050pmaddubsw m3, m1, m6
29051pmulhrsw m3, m7
29052pmaddubsw m5, m4, m6
29053pmulhrsw m5, m7
29054packuswb m3, m5
29055movu [r0 + 1653 * 16], m3
29056
29057; mode 29 [row 5 - second half]
29058movu [r0 + 1739 * 16], m3
29059
29060; mode 27 [row 27]
29061movu m6, [r5 + 24 * 16]
29062pmaddubsw m3, m0, m6
29063pmulhrsw m3, m7
29064pmaddubsw m5, m2, m6
29065pmulhrsw m5, m7
29066packuswb m3, m5
29067movu [r0 + 1654 * 16], m3
29068pmaddubsw m3, m1, m6
29069pmulhrsw m3, m7
29070pmaddubsw m5, m4, m6
29071pmulhrsw m5, m7
29072packuswb m3, m5
29073movu [r0 + 1655 * 16], m3
29074
29075; mode 27 [row 28]
29076movu m6, [r5 + 26 * 16]
29077pmaddubsw m3, m0, m6
29078pmulhrsw m3, m7
29079pmaddubsw m5, m2, m6
29080pmulhrsw m5, m7
29081packuswb m3, m5
29082movu [r0 + 1656 * 16], m3
29083pmaddubsw m3, m1, m6
29084pmulhrsw m3, m7
29085pmaddubsw m5, m4, m6
29086pmulhrsw m5, m7
29087packuswb m3, m5
29088movu [r0 + 1657 * 16], m3
29089
29090; mode 27 [row 29]
29091movu m6, [r5 + 28 * 16]
29092pmaddubsw m3, m0, m6
29093pmulhrsw m3, m7
29094pmaddubsw m5, m2, m6
29095pmulhrsw m5, m7
29096packuswb m3, m5
29097movu [r0 + 1658 * 16], m3
29098
29099; mode 28 [row 11 - first half]
29100movu [r0 + 1686 * 16], m3
29101
29102pmaddubsw m3, m1, m6
29103pmulhrsw m3, m7
29104pmaddubsw m5, m4, m6
29105pmulhrsw m5, m7
29106packuswb m3, m5
29107movu [r0 + 1659 * 16], m3
29108
29109; mode 28 [row 11 - second half]
29110movu [r0 + 1687 * 16], m3
29111
29112; mode 27 [row 30]
29113movu m6, [r5 + 30 * 16]
29114pmaddubsw m3, m0, m6
29115pmulhrsw m3, m7
29116pmaddubsw m5, m2, m6
29117pmulhrsw m5, m7
29118packuswb m3, m5
29119movu [r0 + 1660 * 16], m3
29120pmaddubsw m3, m1, m6
29121pmulhrsw m3, m7
29122pmaddubsw m5, m4, m6
29123pmulhrsw m5, m7
29124packuswb m3, m5
29125movu [r0 + 1661 * 16], m3
29126
29127; mode 28 [row 6]
29128movu m6, [r5 + 3 * 16]
29129pmaddubsw m3, m0, m6
29130pmulhrsw m3, m7
29131pmaddubsw m5, m2, m6
29132pmulhrsw m5, m7
29133packuswb m3, m5
29134movu [r0 + 1676 * 16], m3
29135pmaddubsw m3, m1, m6
29136pmulhrsw m3, m7
29137pmaddubsw m5, m4, m6
29138pmulhrsw m5, m7
29139packuswb m3, m5
29140movu [r0 + 1677 * 16], m3
29141
29142; mode 28 [row 8]
29143movu m6, [r5 + 13 * 16]
29144pmaddubsw m3, m0, m6
29145pmulhrsw m3, m7
29146pmaddubsw m5, m2, m6
29147pmulhrsw m5, m7
29148packuswb m3, m5
29149movu [r0 + 1680 * 16], m3
29150
29151; mode 29 [row 4 - first half]
29152movu [r0 + 1736 * 16], m3
29153
29154pmaddubsw m3, m1, m6
29155pmulhrsw m3, m7
29156pmaddubsw m5, m4, m6
29157pmulhrsw m5, m7
29158packuswb m3, m5
29159movu [r0 + 1681 * 16], m3
29160
29161; mode 29 [row 4 - second half]
29162movu [r0 + 1737 * 16], m3
29163
29164; mode 28 [row 10]
29165movu m6, [r5 + 23 * 16]
29166pmaddubsw m3, m0, m6
29167pmulhrsw m3, m7
29168pmaddubsw m5, m2, m6
29169pmulhrsw m5, m7
29170packuswb m3, m5
29171movu [r0 + 1684 * 16], m3
29172pmaddubsw m3, m1, m6
29173pmulhrsw m3, m7
29174pmaddubsw m5, m4, m6
29175pmulhrsw m5, m7
29176packuswb m3, m5
29177movu [r0 + 1685 * 16], m3
29178
29179; mode 29 [row 6]
29180movu m6, [r5 + 31 * 16]
29181pmaddubsw m3, m0, m6
29182pmulhrsw m3, m7
29183pmaddubsw m5, m2, m6
29184pmulhrsw m5, m7
29185packuswb m3, m5
29186movu [r0 + 1740 * 16], m3
29187
29188; mode 32 [row 2 - first half]
29189movu [r0 + 1924 * 16], m3
29190
29191pmaddubsw m3, m1, m6
29192pmulhrsw m3, m7
29193pmaddubsw m5, m4, m6
29194pmulhrsw m5, m7
29195packuswb m3, m5
29196movu [r0 + 1741 * 16], m3
29197
29198; mode 32 [row 2 - second half]
29199movu [r0 + 1925 * 16], m3
29200
29201; mode 30 [row 2]
29202movu m6, [r5 + 7 * 16]
29203pmaddubsw m3, m0, m6
29204pmulhrsw m3, m7
29205pmaddubsw m5, m2, m6
29206pmulhrsw m5, m7
29207packuswb m3, m5
29208movu [r0 + 1796 * 16], m3
29209pmaddubsw m3, m1, m6
29210pmulhrsw m3, m7
29211pmaddubsw m5, m4, m6
29212pmulhrsw m5, m7
29213packuswb m3, m5
29214movu [r0 + 1797 * 16], m3
29215
29216; mode 31 [row 2]
29217movu m6, [r5 + 19 * 16]
29218pmaddubsw m3, m0, m6
29219pmulhrsw m3, m7
29220pmaddubsw m5, m2, m6
29221pmulhrsw m5, m7
29222packuswb m3, m5
29223movu [r0 + 1860 * 16], m3
29224pmaddubsw m3, m1, m6
29225pmulhrsw m3, m7
29226pmaddubsw m5, m4, m6
29227pmulhrsw m5, m7
29228packuswb m3, m5
29229movu [r0 + 1861 * 16], m3
29230
29231; mode 27 [row 31]
29232movu m0, [r3 + 3]
29233movd m1, [r3 + 4]
29234palignr m1, m0, 1
29235punpcklbw m0, m1
29236movu m2, [r3 + 11]
29237movd m3, [r3 + 12]
29238palignr m3, m2, 1
29239punpcklbw m2, m3
29240movu m1, [r3 + 19]
29241movd m3, [r3 + 20]
29242palignr m3, m1, 1
29243punpcklbw m1, m3
29244movu m4, [r3 + 27]
29245movd m5, [r3 + 28]
29246palignr m5, m4, 1
29247punpcklbw m4, m5
29248
29249pshufb m5, m0, [tab_S2]
29250movh [r0 + 1662 * 16], m5
29251pshufb m5, m2, [tab_S2]
29252movh [r0 + 1662 * 16 + 8], m5
29253pshufb m5, m1, [tab_S2]
29254movh [r0 + 1663 * 16], m5
29255pshufb m5, m4, [tab_S2]
29256movh [r0 + 1663 * 16 + 8], m5
29257
29258; mode 28 [row 12]
29259movu m6, [r5 + 1 * 16]
29260pmaddubsw m3, m0, m6
29261pmulhrsw m3, m7
29262pmaddubsw m5, m2, m6
29263pmulhrsw m5, m7
29264packuswb m3, m5
29265movu [r0 + 1688 * 16], m3
29266
29267; mode 30 [row 4 - first half]
29268movu [r0 + 1800 * 16], m3
29269
29270pmaddubsw m3, m1, m6
29271pmulhrsw m3, m7
29272pmaddubsw m5, m4, m6
29273pmulhrsw m5, m7
29274packuswb m3, m5
29275movu [r0 + 1689 * 16], m3
29276
29277; mode 30 [row 4 - second half]
29278movu [r0 + 1801 * 16], m3
29279
29280; mode 28 [row 13]
29281movu m6, [r5 + 6 * 16]
29282pmaddubsw m3, m0, m6
29283pmulhrsw m3, m7
29284pmaddubsw m5, m2, m6
29285pmulhrsw m5, m7
29286packuswb m3, m5
29287movu [r0 + 1690 * 16], m3
29288pmaddubsw m3, m1, m6
29289pmulhrsw m3, m7
29290pmaddubsw m5, m4, m6
29291pmulhrsw m5, m7
29292packuswb m3, m5
29293movu [r0 + 1691 * 16], m3
29294
29295; mode 28 [row 14]
29296movu m6, [r5 + 11 * 16]
29297pmaddubsw m3, m0, m6
29298pmulhrsw m3, m7
29299pmaddubsw m5, m2, m6
29300pmulhrsw m5, m7
29301packuswb m3, m5
29302movu [r0 + 1692 * 16], m3
29303pmaddubsw m3, m1, m6
29304pmulhrsw m3, m7
29305pmaddubsw m5, m4, m6
29306pmulhrsw m5, m7
29307packuswb m3, m5
29308movu [r0 + 1693 * 16], m3
29309
29310; mode 28 [row 15]
29311movu m6, [r5 + 16 * 16]
29312pmaddubsw m3, m0, m6
29313pmulhrsw m3, m7
29314pmaddubsw m5, m2, m6
29315pmulhrsw m5, m7
29316packuswb m3, m5
29317movu [r0 + 1694 * 16], m3
29318pmaddubsw m3, m1, m6
29319pmulhrsw m3, m7
29320pmaddubsw m5, m4, m6
29321pmulhrsw m5, m7
29322packuswb m3, m5
29323movu [r0 + 1695 * 16], m3
29324
29325; mode 28 [row 16]
29326movu m6, [r5 + 21 * 16]
29327pmaddubsw m3, m0, m6
29328pmulhrsw m3, m7
29329pmaddubsw m5, m2, m6
29330pmulhrsw m5, m7
29331packuswb m3, m5
29332movu [r0 + 1696 * 16], m3
29333
29334; mode 31 [row 4 - first half]
29335movu [r0 + 1864 * 16], m3
29336
29337pmaddubsw m3, m1, m6
29338pmulhrsw m3, m7
29339pmaddubsw m5, m4, m6
29340pmulhrsw m5, m7
29341packuswb m3, m5
29342movu [r0 + 1697 * 16], m3
29343
29344; mode 31 [row 4 - second half]
29345movu [r0 + 1865 * 16], m3
29346
29347; mode 28 [row 17]
29348movu m6, [r5 + 26 * 16]
29349pmaddubsw m3, m0, m6
29350pmulhrsw m3, m7
29351pmaddubsw m5, m2, m6
29352pmulhrsw m5, m7
29353packuswb m3, m5
29354movu [r0 + 1698 * 16], m3
29355
29356; mode 29 [row 9 - first half]
29357movu [r0 + 1746 * 16], m3
29358
29359pmaddubsw m3, m1, m6
29360pmulhrsw m3, m7
29361pmaddubsw m5, m4, m6
29362pmulhrsw m5, m7
29363packuswb m3, m5
29364movu [r0 + 1699 * 16], m3
29365
29366; mode 29 [row 9 - second half]
29367movu [r0 + 1747 * 16], m3
29368
29369; mode 28 [row 18]
29370movu m6, [r5 + 31 * 16]
29371pmaddubsw m3, m0, m6
29372pmulhrsw m3, m7
29373pmaddubsw m5, m2, m6
29374pmulhrsw m5, m7
29375packuswb m3, m5
29376movu [r0 + 1700 * 16], m3
29377pmaddubsw m3, m1, m6
29378pmulhrsw m3, m7
29379pmaddubsw m5, m4, m6
29380pmulhrsw m5, m7
29381packuswb m3, m5
29382movu [r0 + 1701 * 16], m3
29383
29384; mode 29 [row 7]
29385movu m6, [r5 + 8 * 16]
29386pmaddubsw m3, m0, m6
29387pmulhrsw m3, m7
29388pmaddubsw m5, m2, m6
29389pmulhrsw m5, m7
29390packuswb m3, m5
29391movu [r0 + 1742 * 16], m3
29392pmaddubsw m3, m1, m6
29393pmulhrsw m3, m7
29394pmaddubsw m5, m4, m6
29395pmulhrsw m5, m7
29396packuswb m3, m5
29397movu [r0 + 1743 * 16], m3
29398
29399; mode 29 [row 8]
29400movu m6, [r5 + 17 * 16]
29401pmaddubsw m3, m0, m6
29402pmulhrsw m3, m7
29403pmaddubsw m5, m2, m6
29404pmulhrsw m5, m7
29405packuswb m3, m5
29406movu [r0 + 1744 * 16], m3
29407pmaddubsw m3, m1, m6
29408pmulhrsw m3, m7
29409pmaddubsw m5, m4, m6
29410pmulhrsw m5, m7
29411packuswb m3, m5
29412movu [r0 + 1745 * 16], m3
29413
29414; mode 30 [row 5]
29415movu m6, [r5 + 14 * 16]
29416pmaddubsw m3, m0, m6
29417pmulhrsw m3, m7
29418pmaddubsw m5, m2, m6
29419pmulhrsw m5, m7
29420packuswb m3, m5
29421movu [r0 + 1802 * 16], m3
29422
29423; mode 33 [row 2 - first half]
29424movu [r0 + 1988 * 16], m3
29425
29426pmaddubsw m3, m1, m6
29427pmulhrsw m3, m7
29428pmaddubsw m5, m4, m6
29429pmulhrsw m5, m7
29430packuswb m3, m5
29431movu [r0 + 1803 * 16], m3
29432
29433; mode 33 [row 2 - second half]
29434movu [r0 + 1989 * 16], m3
29435
29436; mode 30 [row 6]
29437movu m6, [r5 + 27 * 16]
29438pmaddubsw m3, m0, m6
29439pmulhrsw m3, m7
29440pmaddubsw m5, m2, m6
29441pmulhrsw m5, m7
29442packuswb m3, m5
29443movu [r0 + 1804 * 16], m3
29444pmaddubsw m3, m1, m6
29445pmulhrsw m3, m7
29446pmaddubsw m5, m4, m6
29447pmulhrsw m5, m7
29448packuswb m3, m5
29449movu [r0 + 1805 * 16], m3
29450
29451; mode 31 [row 3]
29452movu m6, [r5 + 4 * 16]
29453pmaddubsw m3, m0, m6
29454pmulhrsw m3, m7
29455pmaddubsw m5, m2, m6
29456pmulhrsw m5, m7
29457packuswb m3, m5
29458movu [r0 + 1862 * 16], m3
29459pmaddubsw m3, m1, m6
29460pmulhrsw m3, m7
29461pmaddubsw m5, m4, m6
29462pmulhrsw m5, m7
29463packuswb m3, m5
29464movu [r0 + 1863 * 16], m3
29465
29466; mode 32 [row 3]
29467movu m6, [r5 + 20 * 16]
29468pmaddubsw m3, m0, m6
29469pmulhrsw m3, m7
29470pmaddubsw m5, m2, m6
29471pmulhrsw m5, m7
29472packuswb m3, m5
29473movu [r0 + 1926 * 16], m3
29474pmaddubsw m3, m1, m6
29475pmulhrsw m3, m7
29476pmaddubsw m5, m4, m6
29477pmulhrsw m5, m7
29478packuswb m3, m5
29479movu [r0 + 1927 * 16], m3
29480
29481; mode 28 [row 19]
29482movu m6, [r5 + 4 * 16]
29483movu m0, [r3 + 4]
29484movd m1, [r3 + 5]
29485palignr m1, m0, 1
29486punpcklbw m0, m1
29487pmaddubsw m3, m0, m6
29488pmulhrsw m3, m7
29489movu m2, [r3 + 12]
29490movd m4, [r3 + 13]
29491palignr m4, m2, 1
29492punpcklbw m2, m4
29493pmaddubsw m5, m2, m6
29494pmulhrsw m5, m7
29495packuswb m3, m5
29496movu [r0 + 1702 * 16], m3
29497
29498movu m1, [r3 + 20]
29499movd m3, [r3 + 21]
29500palignr m3, m1, 1
29501punpcklbw m1, m3
29502pmaddubsw m3, m1, m6
29503pmulhrsw m3, m7
29504movu m4, [r3 + 28]
29505movd m5, [r3 + 29]
29506palignr m5, m4, 1
29507punpcklbw m4, m5
29508pmaddubsw m5, m4, m6
29509pmulhrsw m5, m7
29510packuswb m3, m5
29511movu [r0 + 1703 * 16], m3
29512
29513; mode 28 [row 20]
29514movu m6, [r5 + 9 * 16]
29515pmaddubsw m3, m0, m6
29516pmulhrsw m3, m7
29517pmaddubsw m5, m2, m6
29518pmulhrsw m5, m7
29519packuswb m3, m5
29520movu [r0 + 1704 * 16], m3
29521
29522; mode 32 [row 4 - first half]
29523movu [r0 + 1928 * 16], m3
29524
29525pmaddubsw m3, m1, m6
29526pmulhrsw m3, m7
29527pmaddubsw m5, m4, m6
29528pmulhrsw m5, m7
29529packuswb m3, m5
29530movu [r0 + 1705 * 16], m3
29531
29532; mode 32 [row 4 - second half]
29533movu [r0 + 1929 * 16], m3
29534
29535; mode 28 [row 21]
29536movu m6, [r5 + 14 * 16]
29537pmaddubsw m3, m0, m6
29538pmulhrsw m3, m7
29539pmaddubsw m5, m2, m6
29540pmulhrsw m5, m7
29541packuswb m3, m5
29542movu [r0 + 1706 * 16], m3
29543pmaddubsw m3, m1, m6
29544pmulhrsw m3, m7
29545pmaddubsw m5, m4, m6
29546pmulhrsw m5, m7
29547packuswb m3, m5
29548movu [r0 + 1707 * 16], m3
29549
29550; mode 28 [row 22]
29551movu m6, [r5 + 19 * 16]
29552pmaddubsw m3, m0, m6
29553pmulhrsw m3, m7
29554pmaddubsw m5, m2, m6
29555pmulhrsw m5, m7
29556packuswb m3, m5
29557movu [r0 + 1708 * 16], m3
29558pmaddubsw m3, m1, m6
29559pmulhrsw m3, m7
29560pmaddubsw m5, m4, m6
29561pmulhrsw m5, m7
29562packuswb m3, m5
29563movu [r0 + 1709 * 16], m3
29564
29565; mode 28 [row 23]
29566movu m6, [r5 + 24 * 16]
29567pmaddubsw m3, m0, m6
29568pmulhrsw m3, m7
29569pmaddubsw m5, m2, m6
29570pmulhrsw m5, m7
29571packuswb m3, m5
29572movu [r0 + 1710 * 16], m3
29573pmaddubsw m3, m1, m6
29574pmulhrsw m3, m7
29575pmaddubsw m5, m4, m6
29576pmulhrsw m5, m7
29577packuswb m3, m5
29578movu [r0 + 1711 * 16], m3
29579
29580; mode 28 [row 24]
29581movu m6, [r5 + 29 * 16]
29582pmaddubsw m3, m0, m6
29583pmulhrsw m3, m7
29584pmaddubsw m5, m2, m6
29585pmulhrsw m5, m7
29586packuswb m3, m5
29587movu [r0 + 1712 * 16], m3
29588pmaddubsw m3, m1, m6
29589pmulhrsw m3, m7
29590pmaddubsw m5, m4, m6
29591pmulhrsw m5, m7
29592packuswb m3, m5
29593movu [r0 + 1713 * 16], m3
29594
29595; mode 29 [row 10]
29596movu m6, [r5 + 3 * 16]
29597pmaddubsw m3, m0, m6
29598pmulhrsw m3, m7
29599pmaddubsw m5, m2, m6
29600pmulhrsw m5, m7
29601packuswb m3, m5
29602movu [r0 + 1748 * 16], m3
29603pmaddubsw m3, m1, m6
29604pmulhrsw m3, m7
29605pmaddubsw m5, m4, m6
29606pmulhrsw m5, m7
29607packuswb m3, m5
29608movu [r0 + 1749 * 16], m3
29609
29610; mode 29 [row 11]
29611movu m6, [r5 + 12 * 16]
29612pmaddubsw m3, m0, m6
29613pmulhrsw m3, m7
29614pmaddubsw m5, m2, m6
29615pmulhrsw m5, m7
29616packuswb m3, m5
29617movu [r0 + 1750 * 16], m3
29618pmaddubsw m3, m1, m6
29619pmulhrsw m3, m7
29620pmaddubsw m5, m4, m6
29621pmulhrsw m5, m7
29622packuswb m3, m5
29623movu [r0 + 1751 * 16], m3
29624
29625; mode 29 [row 12]
29626movu m6, [r5 + 21 * 16]
29627pmaddubsw m3, m0, m6
29628pmulhrsw m3, m7
29629pmaddubsw m5, m2, m6
29630pmulhrsw m5, m7
29631packuswb m3, m5
29632movu [r0 + 1752 * 16], m3
29633
29634; mode 30 [row 8 - first half]
29635movu [r0 + 1808 * 16], m3
29636
29637pmaddubsw m3, m1, m6
29638pmulhrsw m3, m7
29639pmaddubsw m5, m4, m6
29640pmulhrsw m5, m7
29641packuswb m3, m5
29642movu [r0 + 1753 * 16], m3
29643
29644; mode 30 [row 8 - second half]
29645movu [r0 + 1809 * 16], m3
29646
29647; mode 29 [row 13]
29648movu m6, [r5 + 30 * 16]
29649pmaddubsw m3, m0, m6
29650pmulhrsw m3, m7
29651pmaddubsw m5, m2, m6
29652pmulhrsw m5, m7
29653packuswb m3, m5
29654movu [r0 + 1754 * 16], m3
29655
29656; mode 32 [row 5 - first half]
29657movu [r0 + 1930 * 16], m3
29658
29659pmaddubsw m3, m1, m6
29660pmulhrsw m3, m7
29661pmaddubsw m5, m4, m6
29662pmulhrsw m5, m7
29663packuswb m3, m5
29664movu [r0 + 1755 * 16], m3
29665
29666; mode 32 [row 5 - second half]
29667movu [r0 + 1931 * 16], m3
29668
29669; mode 30 [row 7]
29670movu m6, [r5 + 8 * 16]
29671pmaddubsw m3, m0, m6
29672pmulhrsw m3, m7
29673pmaddubsw m5, m2, m6
29674pmulhrsw m5, m7
29675packuswb m3, m5
29676movu [r0 + 1806 * 16], m3
29677
29678; mode 33 [row 3 - first half]
29679movu [r0 + 1990 * 16], m3
29680
29681pmaddubsw m3, m1, m6
29682pmulhrsw m3, m7
29683pmaddubsw m5, m4, m6
29684pmulhrsw m5, m7
29685packuswb m3, m5
29686movu [r0 + 1807 * 16], m3
29687
29688; mode 33 [row 3 - second half]
29689movu [r0 + 1991 * 16], m3
29690
29691; mode 31 [row 5]
29692movu m6, [r5 + 6 * 16]
29693pmaddubsw m3, m0, m6
29694pmulhrsw m3, m7
29695pmaddubsw m5, m2, m6
29696pmulhrsw m5, m7
29697packuswb m3, m5
29698movu [r0 + 1866 * 16], m3
29699pmaddubsw m3, m1, m6
29700pmulhrsw m3, m7
29701pmaddubsw m5, m4, m6
29702pmulhrsw m5, m7
29703packuswb m3, m5
29704movu [r0 + 1867 * 16], m3
29705
29706; mode 31 [row 6]
29707movu m6, [r5 + 23 * 16]
29708pmaddubsw m3, m0, m6
29709pmulhrsw m3, m7
29710pmaddubsw m5, m2, m6
29711pmulhrsw m5, m7
29712packuswb m3, m5
29713movu [r0 + 1868 * 16], m3
29714pmaddubsw m3, m1, m6
29715pmulhrsw m3, m7
29716pmaddubsw m5, m4, m6
29717pmulhrsw m5, m7
29718packuswb m3, m5
29719movu [r0 + 1869 * 16], m3
29720
29721; mode 28 [row 25]
29722movu m6, [r5 + 2 * 16]
29723movu m0, [r3 + 5]
29724movd m1, [r3 + 6]
29725palignr m1, m0, 1
29726punpcklbw m0, m1
29727pmaddubsw m3, m0, m6
29728pmulhrsw m3, m7
29729movu m2, [r3 + 13]
29730movd m4, [r3 + 14]
29731palignr m4, m2, 1
29732punpcklbw m2, m4
29733pmaddubsw m5, m2, m6
29734pmulhrsw m5, m7
29735packuswb m3, m5
29736movu [r0 + 1714 * 16], m3
29737
29738movu m1, [r3 + 21]
29739movd m3, [r3 + 22]
29740palignr m3, m1, 1
29741punpcklbw m1, m3
29742pmaddubsw m3, m1, m6
29743pmulhrsw m3, m7
29744movu m4, [r3 + 29]
29745movd m5, [r3 + 30]
29746palignr m5, m4, 1
29747punpcklbw m4, m5
29748pmaddubsw m5, m4, m6
29749pmulhrsw m5, m7
29750packuswb m3, m5
29751movu [r0 + 1715 * 16], m3
29752
29753; mode 28 [row 26]
29754movu m6, [r5 + 7 * 16]
29755pmaddubsw m3, m0, m6
29756pmulhrsw m3, m7
29757pmaddubsw m5, m2, m6
29758pmulhrsw m5, m7
29759packuswb m3, m5
29760movu [r0 + 1716 * 16], m3
29761
29762; mode 29 [row 14 - first half]
29763movu [r0 + 1756 * 16], m3
29764
29765pmaddubsw m3, m1, m6
29766pmulhrsw m3, m7
29767pmaddubsw m5, m4, m6
29768pmulhrsw m5, m7
29769packuswb m3, m5
29770movu [r0 + 1717 * 16], m3
29771
29772; mode 29 [row 14 - second half]
29773movu [r0 + 1757 * 16], m3
29774
29775; mode 28 [row 27]
29776movu m6, [r5 + 12 * 16]
29777pmaddubsw m3, m0, m6
29778pmulhrsw m3, m7
29779pmaddubsw m5, m2, m6
29780pmulhrsw m5, m7
29781packuswb m3, m5
29782movu [r0 + 1718 * 16], m3
29783pmaddubsw m3, m1, m6
29784pmulhrsw m3, m7
29785pmaddubsw m5, m4, m6
29786pmulhrsw m5, m7
29787packuswb m3, m5
29788movu [r0 + 1719 * 16], m3
29789
29790; mode 28 [row 28]
29791movu m6, [r5 + 17 * 16]
29792pmaddubsw m3, m0, m6
29793pmulhrsw m3, m7
29794pmaddubsw m5, m2, m6
29795pmulhrsw m5, m7
29796packuswb m3, m5
29797movu [r0 + 1720 * 16], m3
29798pmaddubsw m3, m1, m6
29799pmulhrsw m3, m7
29800pmaddubsw m5, m4, m6
29801pmulhrsw m5, m7
29802packuswb m3, m5
29803movu [r0 + 1721 * 16], m3
29804
29805; mode 28 [row 29]
29806movu m6, [r5 + 22 * 16]
29807pmaddubsw m3, m0, m6
29808pmulhrsw m3, m7
29809pmaddubsw m5, m2, m6
29810pmulhrsw m5, m7
29811packuswb m3, m5
29812movu [r0 + 1722 * 16], m3
29813pmaddubsw m3, m1, m6
29814pmulhrsw m3, m7
29815pmaddubsw m5, m4, m6
29816pmulhrsw m5, m7
29817packuswb m3, m5
29818movu [r0 + 1723 * 16], m3
29819
29820; mode 28 [row 30]
29821movu m6, [r5 + 27 * 16]
29822pmaddubsw m3, m0, m6
29823pmulhrsw m3, m7
29824pmaddubsw m5, m2, m6
29825pmulhrsw m5, m7
29826packuswb m3, m5
29827movu [r0 + 1724 * 16], m3
29828pmaddubsw m3, m1, m6
29829pmulhrsw m3, m7
29830pmaddubsw m5, m4, m6
29831pmulhrsw m5, m7
29832packuswb m3, m5
29833movu [r0 + 1725 * 16], m3
29834
29835; mode 29 [row 15]
29836movu m6, [r5 + 16 * 16]
29837pmaddubsw m3, m0, m6
29838pmulhrsw m3, m7
29839pmaddubsw m5, m2, m6
29840pmulhrsw m5, m7
29841packuswb m3, m5
29842movu [r0 + 1758 * 16], m3
29843pmaddubsw m3, m1, m6
29844pmulhrsw m3, m7
29845pmaddubsw m5, m4, m6
29846pmulhrsw m5, m7
29847packuswb m3, m5
29848movu [r0 + 1759 * 16], m3
29849
29850; mode 29 [row 16]
29851movu m6, [r5 + 25 * 16]
29852pmaddubsw m3, m0, m6
29853pmulhrsw m3, m7
29854pmaddubsw m5, m2, m6
29855pmulhrsw m5, m7
29856packuswb m3, m5
29857movu [r0 + 1760 * 16], m3
29858pmaddubsw m3, m1, m6
29859pmulhrsw m3, m7
29860pmaddubsw m5, m4, m6
29861pmulhrsw m5, m7
29862packuswb m3, m5
29863movu [r0 + 1761 * 16], m3
29864
29865; mode 30 [row 9]
29866movu m6, [r5 + 2 * 16]
29867pmaddubsw m3, m0, m6
29868pmulhrsw m3, m7
29869pmaddubsw m5, m2, m6
29870pmulhrsw m5, m7
29871packuswb m3, m5
29872movu [r0 + 1810 * 16], m3
29873
29874; mode 33 [row 4 - first half]
29875movu [r0 + 1992 * 16], m3
29876
29877pmaddubsw m3, m1, m6
29878pmulhrsw m3, m7
29879pmaddubsw m5, m4, m6
29880pmulhrsw m5, m7
29881packuswb m3, m5
29882movu [r0 + 1811 * 16], m3
29883
29884; mode 33 [row 4 - second half]
29885movu [r0 + 1993 * 16], m3
29886
29887; mode 30 [row 10]
29888movu m6, [r5 + 15 * 16]
29889pmaddubsw m3, m0, m6
29890pmulhrsw m3, m7
29891pmaddubsw m5, m2, m6
29892pmulhrsw m5, m7
29893packuswb m3, m5
29894movu [r0 + 1812 * 16], m3
29895pmaddubsw m3, m1, m6
29896pmulhrsw m3, m7
29897pmaddubsw m5, m4, m6
29898pmulhrsw m5, m7
29899packuswb m3, m5
29900movu [r0 + 1813 * 16], m3
29901
29902; mode 31 [row 7]
29903movu m6, [r5 + 8 * 16]
29904pmaddubsw m3, m0, m6
29905pmulhrsw m3, m7
29906pmaddubsw m5, m2, m6
29907pmulhrsw m5, m7
29908packuswb m3, m5
29909movu [r0 + 1870 * 16], m3
29910pmaddubsw m3, m1, m6
29911pmulhrsw m3, m7
29912pmaddubsw m5, m4, m6
29913pmulhrsw m5, m7
29914packuswb m3, m5
29915movu [r0 + 1871 * 16], m3
29916
29917; mode 31 [row 8]
29918movu m6, [r5 + 25 * 16]
29919pmaddubsw m3, m0, m6
29920pmulhrsw m3, m7
29921pmaddubsw m5, m2, m6
29922pmulhrsw m5, m7
29923packuswb m3, m5
29924movu [r0 + 1872 * 16], m3
29925pmaddubsw m3, m1, m6
29926pmulhrsw m3, m7
29927pmaddubsw m5, m4, m6
29928pmulhrsw m5, m7
29929packuswb m3, m5
29930movu [r0 + 1873 * 16], m3
29931
29932; mode 32 [row 6]
29933movu m6, [r5 + 19 * 16]
29934pmaddubsw m3, m0, m6
29935pmulhrsw m3, m7
29936pmaddubsw m5, m2, m6
29937pmulhrsw m5, m7
29938packuswb m3, m5
29939movu [r0 + 1932 * 16], m3
29940pmaddubsw m3, m1, m6
29941pmulhrsw m3, m7
29942pmaddubsw m5, m4, m6
29943pmulhrsw m5, m7
29944packuswb m3, m5
29945movu [r0 + 1933 * 16], m3
29946
29947; mode 30 [row 11]
29948movu m6, [r5 + 28 * 16]
29949pmaddubsw m3, m0, m6
29950pmulhrsw m3, m7
29951pmaddubsw m5, m2, m6
29952pmulhrsw m5, m7
29953packuswb m3, m5
29954movu [r0 + 1814 * 16], m3
29955
29956; mode 33 [row 5 - first half]
29957movu [r0 + 1994 * 16], m3
29958
29959pmaddubsw m3, m1, m6
29960pmulhrsw m3, m7
29961pmaddubsw m5, m4, m6
29962pmulhrsw m5, m7
29963packuswb m3, m5
29964movu [r0 + 1815 * 16], m3
29965
29966; mode 33 [row 5 - second half]
29967movu [r0 + 1995 * 16], m3
29968
29969; mode 28 [row 31]
29970movu m0, [r3 + 6]
29971movd m1, [r3 + 7]
29972palignr m1, m0, 1
29973punpcklbw m0, m1
29974movu m2, [r3 + 14]
29975movd m3, [r3 + 15]
29976palignr m3, m2, 1
29977punpcklbw m2, m3
29978movu m1, [r3 + 22]
29979movd m3, [r3 + 23]
29980palignr m3, m1, 1
29981punpcklbw m1, m3
29982movu m4, [r3 + 30]
29983movd m5, [r3 + 31]
29984palignr m5, m4, 1
29985punpcklbw m4, m5
29986
29987pshufb m5, m0, [tab_S2]
29988movh [r0 + 1726 * 16], m5
29989pshufb m5, m2, [tab_S2]
29990movh [r0 + 1726 * 16 + 8], m5
29991pshufb m5, m1, [tab_S2]
29992movh [r0 + 1727 * 16], m5
29993pshufb m5, m4, [tab_S2]
29994movh [r0 + 1727 * 16 + 8], m5
29995
29996; mode 29 [row 17]
29997movu m6, [r5 + 2 * 16]
29998pmaddubsw m3, m0, m6
29999pmulhrsw m3, m7
30000pmaddubsw m5, m2, m6
30001pmulhrsw m5, m7
30002packuswb m3, m5
30003movu [r0 + 1762 * 16], m3
30004pmaddubsw m3, m1, m6
30005pmulhrsw m3, m7
30006pmaddubsw m5, m4, m6
30007pmulhrsw m5, m7
30008packuswb m3, m5
30009movu [r0 + 1763 * 16], m3
30010
30011; mode 29 [row 18]
30012movu m6, [r5 + 11 * 16]
30013pmaddubsw m3, m0, m6
30014pmulhrsw m3, m7
30015pmaddubsw m5, m2, m6
30016pmulhrsw m5, m7
30017packuswb m3, m5
30018movu [r0 + 1764 * 16], m3
30019pmaddubsw m3, m1, m6
30020pmulhrsw m3, m7
30021pmaddubsw m5, m4, m6
30022pmulhrsw m5, m7
30023packuswb m3, m5
30024movu [r0 + 1765 * 16], m3
30025
30026; mode 29 [row 19]
30027movu m6, [r5 + 20 * 16]
30028pmaddubsw m3, m0, m6
30029pmulhrsw m3, m7
30030pmaddubsw m5, m2, m6
30031pmulhrsw m5, m7
30032packuswb m3, m5
30033movu [r0 + 1766 * 16], m3
30034pmaddubsw m3, m1, m6
30035pmulhrsw m3, m7
30036pmaddubsw m5, m4, m6
30037pmulhrsw m5, m7
30038packuswb m3, m5
30039movu [r0 + 1767 * 16], m3
30040
30041; mode 29 [row 20]
30042movu m6, [r5 + 29 * 16]
30043pmaddubsw m3, m0, m6
30044pmulhrsw m3, m7
30045pmaddubsw m5, m2, m6
30046pmulhrsw m5, m7
30047packuswb m3, m5
30048movu [r0 + 1768 * 16], m3
30049
30050; mode 32 [row 8 - first half]
30051movu [r0 + 1936 * 16], m3
30052
30053pmaddubsw m3, m1, m6
30054pmulhrsw m3, m7
30055pmaddubsw m5, m4, m6
30056pmulhrsw m5, m7
30057packuswb m3, m5
30058movu [r0 + 1769 * 16], m3
30059
30060; mode 32 [row 8 - second half]
30061movu [r0 + 1937 * 16], m3
30062
30063; mode 30 [row 12]
30064movu m6, [r5 + 9 * 16]
30065pmaddubsw m3, m0, m6
30066pmulhrsw m3, m7
30067pmaddubsw m5, m2, m6
30068pmulhrsw m5, m7
30069packuswb m3, m5
30070movu [r0 + 1816 * 16], m3
30071pmaddubsw m3, m1, m6
30072pmulhrsw m3, m7
30073pmaddubsw m5, m4, m6
30074pmulhrsw m5, m7
30075packuswb m3, m5
30076movu [r0 + 1817 * 16], m3
30077
30078; mode 30 [row 13]
30079movu m6, [r5 + 22 * 16]
30080pmaddubsw m3, m0, m6
30081pmulhrsw m3, m7
30082pmaddubsw m5, m2, m6
30083pmulhrsw m5, m7
30084packuswb m3, m5
30085movu [r0 + 1818 * 16], m3
30086
30087; mode 33 [row 6 - first half]
30088movu [r0 + 1996 * 16], m3
30089
30090pmaddubsw m3, m1, m6
30091pmulhrsw m3, m7
30092pmaddubsw m5, m4, m6
30093pmulhrsw m5, m7
30094packuswb m3, m5
30095movu [r0 + 1819 * 16], m3
30096
30097; mode 33 [row 6 - second half]
30098movu [r0 + 1997 * 16], m3
30099
30100; mode 31 [row 9]
30101movu m6, [r5 + 10 * 16]
30102pmaddubsw m3, m0, m6
30103pmulhrsw m3, m7
30104pmaddubsw m5, m2, m6
30105pmulhrsw m5, m7
30106packuswb m3, m5
30107movu [r0 + 1874 * 16], m3
30108pmaddubsw m3, m1, m6
30109pmulhrsw m3, m7
30110pmaddubsw m5, m4, m6
30111pmulhrsw m5, m7
30112packuswb m3, m5
30113movu [r0 + 1875 * 16], m3
30114
30115; mode 31 [row 10]
30116movu m6, [r5 + 27 * 16]
30117pmaddubsw m3, m0, m6
30118pmulhrsw m3, m7
30119pmaddubsw m5, m2, m6
30120pmulhrsw m5, m7
30121packuswb m3, m5
30122movu [r0 + 1876 * 16], m3
30123pmaddubsw m3, m1, m6
30124pmulhrsw m3, m7
30125pmaddubsw m5, m4, m6
30126pmulhrsw m5, m7
30127packuswb m3, m5
30128movu [r0 + 1877 * 16], m3
30129
30130; mode 32 [row 7]
30131movu m6, [r5 + 8 * 16]
30132pmaddubsw m3, m0, m6
30133pmulhrsw m3, m7
30134pmaddubsw m5, m2, m6
30135pmulhrsw m5, m7
30136packuswb m3, m5
30137movu [r0 + 1934 * 16], m3
30138pmaddubsw m3, m1, m6
30139pmulhrsw m3, m7
30140pmaddubsw m5, m4, m6
30141pmulhrsw m5, m7
30142packuswb m3, m5
30143movu [r0 + 1935 * 16], m3
30144
30145; mode 29 [row 21]
30146movu m6, [r5 + 6 * 16]
30147movu m0, [r3 + 7]
30148movd m1, [r3 + 8]
30149palignr m1, m0, 1
30150punpcklbw m0, m1
30151pmaddubsw m3, m0, m6
30152pmulhrsw m3, m7
30153movu m2, [r3 + 15]
30154movd m4, [r3 + 16]
30155palignr m4, m2, 1
30156punpcklbw m2, m4
30157pmaddubsw m5, m2, m6
30158pmulhrsw m5, m7
30159packuswb m3, m5
30160movu [r0 + 1770 * 16], m3
30161
30162movu m1, [r3 + 23]
30163movd m3, [r3 + 24]
30164palignr m3, m1, 1
30165punpcklbw m1, m3
30166pmaddubsw m3, m1, m6
30167pmulhrsw m3, m7
30168movu m4, [r3 + 31]
30169movd m5, [r3 + 32]
30170palignr m5, m4, 1
30171punpcklbw m4, m5
30172pmaddubsw m5, m4, m6
30173pmulhrsw m5, m7
30174packuswb m3, m5
30175movu [r0 + 1771 * 16], m3
30176
30177; mode 29 [row 22]
30178movu m6, [r5 + 15 * 16]
30179pmaddubsw m3, m0, m6
30180pmulhrsw m3, m7
30181pmaddubsw m5, m2, m6
30182pmulhrsw m5, m7
30183packuswb m3, m5
30184movu [r0 + 1772 * 16], m3
30185pmaddubsw m3, m1, m6
30186pmulhrsw m3, m7
30187pmaddubsw m5, m4, m6
30188pmulhrsw m5, m7
30189packuswb m3, m5
30190movu [r0 + 1773 * 16], m3
30191
30192; mode 29 [row 23]
30193movu m6, [r5 + 24 * 16]
30194pmaddubsw m3, m0, m6
30195pmulhrsw m3, m7
30196pmaddubsw m5, m2, m6
30197pmulhrsw m5, m7
30198packuswb m3, m5
30199movu [r0 + 1774 * 16], m3
30200pmaddubsw m3, m1, m6
30201pmulhrsw m3, m7
30202pmaddubsw m5, m4, m6
30203pmulhrsw m5, m7
30204packuswb m3, m5
30205movu [r0 + 1775 * 16], m3
30206
30207; mode 30 [row 14]
30208movu m6, [r5 + 3 * 16]
30209pmaddubsw m3, m0, m6
30210pmulhrsw m3, m7
30211pmaddubsw m5, m2, m6
30212pmulhrsw m5, m7
30213packuswb m3, m5
30214movu [r0 + 1820 * 16], m3
30215pmaddubsw m3, m1, m6
30216pmulhrsw m3, m7
30217pmaddubsw m5, m4, m6
30218pmulhrsw m5, m7
30219packuswb m3, m5
30220movu [r0 + 1821 * 16], m3
30221
30222; mode 30 [row 15]
30223movu m6, [r5 + 16 * 16]
30224pmaddubsw m3, m0, m6
30225pmulhrsw m3, m7
30226pmaddubsw m5, m2, m6
30227pmulhrsw m5, m7
30228packuswb m3, m5
30229movu [r0 + 1822 * 16], m3
30230
30231; mode 33 [row 7 - first half]
30232movu [r0 + 1998 * 16], m3
30233
30234pmaddubsw m3, m1, m6
30235pmulhrsw m3, m7
30236pmaddubsw m5, m4, m6
30237pmulhrsw m5, m7
30238packuswb m3, m5
30239movu [r0 + 1823 * 16], m3
30240
30241; mode 33 [row 7 - second half]
30242movu [r0 + 1999 * 16], m3
30243
30244; mode 30 [row 16]
30245movu m6, [r5 + 29 * 16]
30246pmaddubsw m3, m0, m6
30247pmulhrsw m3, m7
30248pmaddubsw m5, m2, m6
30249pmulhrsw m5, m7
30250packuswb m3, m5
30251movu [r0 + 1824 * 16], m3
30252
30253; mode 31 [row 12 - first half]
30254movu [r0 + 1880 * 16], m3
30255
30256pmaddubsw m3, m1, m6
30257pmulhrsw m3, m7
30258pmaddubsw m5, m4, m6
30259pmulhrsw m5, m7
30260packuswb m3, m5
30261movu [r0 + 1825 * 16], m3
30262
30263; mode 31 [row 12 - second half]
30264movu [r0 + 1881 * 16], m3
30265
30266; mode 31 [row 11]
30267movu m6, [r5 + 12 * 16]
30268pmaddubsw m3, m0, m6
30269pmulhrsw m3, m7
30270pmaddubsw m5, m2, m6
30271pmulhrsw m5, m7
30272packuswb m3, m5
30273movu [r0 + 1878 * 16], m3
30274pmaddubsw m3, m1, m6
30275pmulhrsw m3, m7
30276pmaddubsw m5, m4, m6
30277pmulhrsw m5, m7
30278packuswb m3, m5
30279movu [r0 + 1879 * 16], m3
30280
30281; mode 32 [row 9]
30282movu m6, [r5 + 18 * 16]
30283pmaddubsw m3, m0, m6
30284pmulhrsw m3, m7
30285pmaddubsw m5, m2, m6
30286pmulhrsw m5, m7
30287packuswb m3, m5
30288movu [r0 + 1938 * 16], m3
30289pmaddubsw m3, m1, m6
30290pmulhrsw m3, m7
30291pmaddubsw m5, m4, m6
30292pmulhrsw m5, m7
30293packuswb m3, m5
30294movu [r0 + 1939 * 16], m3
30295
30296; mode 29 [row 24]
30297movu m6, [r5 + 1 * 16]
30298movu m0, [r3 + 8]
30299movd m1, [r3 + 9]
30300palignr m1, m0, 1
30301punpcklbw m0, m1
30302pmaddubsw m3, m0, m6
30303pmulhrsw m3, m7
30304movu m2, [r3 + 16]
30305movd m4, [r3 + 17]
30306palignr m4, m2, 1
30307punpcklbw m2, m4
30308pmaddubsw m5, m2, m6
30309pmulhrsw m5, m7
30310packuswb m3, m5
30311movu [r0 + 1776 * 16], m3
30312
30313movu m1, [r3 + 24]
30314movd m3, [r3 + 25]
30315palignr m3, m1, 1
30316punpcklbw m1, m3
30317pmaddubsw m3, m1, m6
30318pmulhrsw m3, m7
30319movu m4, [r3 + 32]
30320movd m5, [r3 + 33]
30321palignr m5, m4, 1
30322punpcklbw m4, m5
30323pmaddubsw m5, m4, m6
30324pmulhrsw m5, m7
30325packuswb m3, m5
30326movu [r0 + 1777 * 16], m3
30327
30328; mode 29 [row 25]
30329movu m6, [r5 + 10 * 16]
30330pmaddubsw m3, m0, m6
30331pmulhrsw m3, m7
30332pmaddubsw m5, m2, m6
30333pmulhrsw m5, m7
30334packuswb m3, m5
30335movu [r0 + 1778 * 16], m3
30336
30337; mode 30 [row 17 - first half]
30338movu [r0 + 1826 * 16], m3
30339
30340; mode 33 [row 8 - first half]
30341movu [r0 + 2000 * 16], m3
30342
30343pmaddubsw m3, m1, m6
30344pmulhrsw m3, m7
30345pmaddubsw m5, m4, m6
30346pmulhrsw m5, m7
30347packuswb m3, m5
30348movu [r0 + 1779 * 16], m3
30349
30350; mode 30 [row 17 - second half]
30351movu [r0 + 1827 * 16], m3
30352
30353; mode 33 [row 8 - second half]
30354movu [r0 + 2001 * 16], m3
30355
30356; mode 29 [row 26]
30357movu m6, [r5 + 19 * 16]
30358pmaddubsw m3, m0, m6
30359pmulhrsw m3, m7
30360pmaddubsw m5, m2, m6
30361pmulhrsw m5, m7
30362packuswb m3, m5
30363movu [r0 + 1780 * 16], m3
30364pmaddubsw m3, m1, m6
30365pmulhrsw m3, m7
30366pmaddubsw m5, m4, m6
30367pmulhrsw m5, m7
30368packuswb m3, m5
30369movu [r0 + 1781 * 16], m3
30370
30371; mode 29 [row 27]
30372movu m6, [r5 + 28 * 16]
30373pmaddubsw m3, m0, m6
30374pmulhrsw m3, m7
30375pmaddubsw m5, m2, m6
30376pmulhrsw m5, m7
30377packuswb m3, m5
30378movu [r0 + 1782 * 16], m3
30379
30380; mode 32 [row 11 - first half]
30381movu [r0 + 1942 * 16], m3
30382
30383pmaddubsw m3, m1, m6
30384pmulhrsw m3, m7
30385pmaddubsw m5, m4, m6
30386pmulhrsw m5, m7
30387packuswb m3, m5
30388movu [r0 + 1783 * 16], m3
30389
30390; mode 32 [row 11 - second half]
30391movu [r0 + 1943 * 16], m3
30392
30393; mode 30 [row 18]
30394movu m6, [r5 + 23 * 16]
30395pmaddubsw m3, m0, m6
30396pmulhrsw m3, m7
30397pmaddubsw m5, m2, m6
30398pmulhrsw m5, m7
30399packuswb m3, m5
30400movu [r0 + 1828 * 16], m3
30401pmaddubsw m3, m1, m6
30402pmulhrsw m3, m7
30403pmaddubsw m5, m4, m6
30404pmulhrsw m5, m7
30405packuswb m3, m5
30406movu [r0 + 1829 * 16], m3
30407
30408; mode 31 [row 13]
30409movu m6, [r5 + 14 * 16]
30410pmaddubsw m3, m0, m6
30411pmulhrsw m3, m7
30412pmaddubsw m5, m2, m6
30413pmulhrsw m5, m7
30414packuswb m3, m5
30415movu [r0 + 1882 * 16], m3
30416pmaddubsw m3, m1, m6
30417pmulhrsw m3, m7
30418pmaddubsw m5, m4, m6
30419pmulhrsw m5, m7
30420packuswb m3, m5
30421movu [r0 + 1883 * 16], m3
30422
30423; mode 31 [row 14]
30424movu m6, [r5 + 31 * 16]
30425pmaddubsw m3, m0, m6
30426pmulhrsw m3, m7
30427pmaddubsw m5, m2, m6
30428pmulhrsw m5, m7
30429packuswb m3, m5
30430movu [r0 + 1884 * 16], m3
30431pmaddubsw m3, m1, m6
30432pmulhrsw m3, m7
30433pmaddubsw m5, m4, m6
30434pmulhrsw m5, m7
30435packuswb m3, m5
30436movu [r0 + 1885 * 16], m3
30437
30438; mode 32 [row 10]
30439movu m6, [r5 + 7 * 16]
30440pmaddubsw m3, m0, m6
30441pmulhrsw m3, m7
30442pmaddubsw m5, m2, m6
30443pmulhrsw m5, m7
30444packuswb m3, m5
30445movu [r0 + 1940 * 16], m3
30446pmaddubsw m3, m1, m6
30447pmulhrsw m3, m7
30448pmaddubsw m5, m4, m6
30449pmulhrsw m5, m7
30450packuswb m3, m5
30451movu [r0 + 1941 * 16], m3
30452
30453; mode 29 [row 28]
30454movu m6, [r5 + 5 * 16]
30455movu m0, [r3 + 9]
30456movd m1, [r3 + 10]
30457palignr m1, m0, 1
30458punpcklbw m0, m1
30459pmaddubsw m3, m0, m6
30460pmulhrsw m3, m7
30461movu m2, [r3 + 17]
30462movd m4, [r3 + 18]
30463palignr m4, m2, 1
30464punpcklbw m2, m4
30465pmaddubsw m5, m2, m6
30466pmulhrsw m5, m7
30467packuswb m3, m5
30468movu [r0 + 1784 * 16], m3
30469
30470movu m1, [r3 + 25]
30471movd m3, [r3 + 26]
30472palignr m3, m1, 1
30473punpcklbw m1, m3
30474pmaddubsw m3, m1, m6
30475pmulhrsw m3, m7
30476movu m4, [r3 + 33]
30477movd m5, [r3 + 34]
30478palignr m5, m4, 1
30479punpcklbw m4, m5
30480pmaddubsw m5, m4, m6
30481pmulhrsw m5, m7
30482packuswb m3, m5
30483movu [r0 + 1785 * 16], m3
30484
30485; mode 29 [row 29]
30486movu m6, [r5 + 14 * 16]
30487pmaddubsw m3, m0, m6
30488pmulhrsw m3, m7
30489pmaddubsw m5, m2, m6
30490pmulhrsw m5, m7
30491packuswb m3, m5
30492movu [r0 + 1786 * 16], m3
30493pmaddubsw m3, m1, m6
30494pmulhrsw m3, m7
30495pmaddubsw m5, m4, m6
30496pmulhrsw m5, m7
30497packuswb m3, m5
30498movu [r0 + 1787 * 16], m3
30499
30500; mode 29 [row 30]
30501movu m6, [r5 + 23 * 16]
30502pmaddubsw m3, m0, m6
30503pmulhrsw m3, m7
30504pmaddubsw m5, m2, m6
30505pmulhrsw m5, m7
30506packuswb m3, m5
30507movu [r0 + 1788 * 16], m3
30508pmaddubsw m3, m1, m6
30509pmulhrsw m3, m7
30510pmaddubsw m5, m4, m6
30511pmulhrsw m5, m7
30512packuswb m3, m5
30513movu [r0 + 1789 * 16], m3
30514
30515; mode 30 [row 19]
30516movu m6, [r5 + 4 * 16]
30517pmaddubsw m3, m0, m6
30518pmulhrsw m3, m7
30519pmaddubsw m5, m2, m6
30520pmulhrsw m5, m7
30521packuswb m3, m5
30522movu [r0 + 1830 * 16], m3
30523
30524; mode 33 [row 9 - first half]
30525movu [r0 + 2002 * 16], m3
30526
30527pmaddubsw m3, m1, m6
30528pmulhrsw m3, m7
30529pmaddubsw m5, m4, m6
30530pmulhrsw m5, m7
30531packuswb m3, m5
30532movu [r0 + 1831 * 16], m3
30533
30534; mode 33 [row 9 - second half]
30535movu [r0 + 2003 * 16], m3
30536
30537; mode 30 [row 20]
30538movu m6, [r5 + 17 * 16]
30539pmaddubsw m3, m0, m6
30540pmulhrsw m3, m7
30541pmaddubsw m5, m2, m6
30542pmulhrsw m5, m7
30543packuswb m3, m5
30544movu [r0 + 1832 * 16], m3
30545
30546; mode 32 [row 12 - first half]
30547movu [r0 + 1944 * 16], m3
30548
30549pmaddubsw m3, m1, m6
30550pmulhrsw m3, m7
30551pmaddubsw m5, m4, m6
30552pmulhrsw m5, m7
30553packuswb m3, m5
30554movu [r0 + 1833 * 16], m3
30555
30556; mode 32 [row 12 - second half]
30557movu [r0 + 1945 * 16], m3
30558
30559; mode 30 [row 21]
30560movu m6, [r5 + 30 * 16]
30561pmaddubsw m3, m0, m6
30562pmulhrsw m3, m7
30563pmaddubsw m5, m2, m6
30564pmulhrsw m5, m7
30565packuswb m3, m5
30566movu [r0 + 1834 * 16], m3
30567
30568; mode 33 [row 10 - first half]
30569movu [r0 + 2004 * 16], m3
30570
30571pmaddubsw m3, m1, m6
30572pmulhrsw m3, m7
30573pmaddubsw m5, m4, m6
30574pmulhrsw m5, m7
30575packuswb m3, m5
30576movu [r0 + 1835 * 16], m3
30577
30578; mode 33 [row 10 - second half]
30579movu [r0 + 2005 * 16], m3
30580
30581; mode 31 [row 15]
30582movu m6, [r5 + 16 * 16]
30583pmaddubsw m3, m0, m6
30584pmulhrsw m3, m7
30585pmaddubsw m5, m2, m6
30586pmulhrsw m5, m7
30587packuswb m3, m5
30588movu [r0 + 1886 * 16], m3
30589pmaddubsw m3, m1, m6
30590pmulhrsw m3, m7
30591pmaddubsw m5, m4, m6
30592pmulhrsw m5, m7
30593packuswb m3, m5
30594movu [r0 + 1887 * 16], m3
30595
30596; mode 29 [row 31]
30597movu m0, [r3 + 10]
30598movd m1, [r3 + 11]
30599palignr m1, m0, 1
30600punpcklbw m0, m1
30601movu m2, [r3 + 18]
30602movd m3, [r3 + 19]
30603palignr m3, m2, 1
30604punpcklbw m2, m3
30605movu m1, [r3 + 26]
30606movd m3, [r3 + 27]
30607palignr m3, m1, 1
30608punpcklbw m1, m3
30609movu m4, [r3 + 34]
30610movd m5, [r3 + 35]
30611palignr m5, m4, 1
30612punpcklbw m4, m5
30613
30614pshufb m5, m0, [tab_S2]
30615movh [r0 + 1790 * 16], m5
30616pshufb m5, m2, [tab_S2]
30617movh [r0 + 1790 * 16 + 8], m5
30618pshufb m5, m1, [tab_S2]
30619movh [r0 + 1791 * 16], m5
30620pshufb m5, m4, [tab_S2]
30621movh [r0 + 1791 * 16 + 8], m5
30622
30623; mode 30 [row 22]
30624movu m6, [r5 + 11 * 16]
30625pmaddubsw m3, m0, m6
30626pmulhrsw m3, m7
30627pmaddubsw m5, m2, m6
30628pmulhrsw m5, m7
30629packuswb m3, m5
30630movu [r0 + 1836 * 16], m3
30631pmaddubsw m3, m1, m6
30632pmulhrsw m3, m7
30633pmaddubsw m5, m4, m6
30634pmulhrsw m5, m7
30635packuswb m3, m5
30636movu [r0 + 1837 * 16], m3
30637
30638; mode 30 [row 23]
30639movu m6, [r5 + 24 * 16]
30640pmaddubsw m3, m0, m6
30641pmulhrsw m3, m7
30642pmaddubsw m5, m2, m6
30643pmulhrsw m5, m7
30644packuswb m3, m5
30645movu [r0 + 1838 * 16], m3
30646
30647; mode 33 [row 11 - first half]
30648movu [r0 + 2006 * 16], m3
30649
30650pmaddubsw m3, m1, m6
30651pmulhrsw m3, m7
30652pmaddubsw m5, m4, m6
30653pmulhrsw m5, m7
30654packuswb m3, m5
30655movu [r0 + 1839 * 16], m3
30656
30657; mode 33 [row 11 - second half]
30658movu [r0 + 2007 * 16], m3
30659
30660; mode 31 [row 16]
30661movu m6, [r5 + 1 * 16]
30662pmaddubsw m3, m0, m6
30663pmulhrsw m3, m7
30664pmaddubsw m5, m2, m6
30665pmulhrsw m5, m7
30666packuswb m3, m5
30667movu [r0 + 1888 * 16], m3
30668pmaddubsw m3, m1, m6
30669pmulhrsw m3, m7
30670pmaddubsw m5, m4, m6
30671pmulhrsw m5, m7
30672packuswb m3, m5
30673movu [r0 + 1889 * 16], m3
30674
30675; mode 31 [row 17]
30676movu m6, [r5 + 18 * 16]
30677pmaddubsw m3, m0, m6
30678pmulhrsw m3, m7
30679pmaddubsw m5, m2, m6
30680pmulhrsw m5, m7
30681packuswb m3, m5
30682movu [r0 + 1890 * 16], m3
30683pmaddubsw m3, m1, m6
30684pmulhrsw m3, m7
30685pmaddubsw m5, m4, m6
30686pmulhrsw m5, m7
30687packuswb m3, m5
30688movu [r0 + 1891 * 16], m3
30689
30690; mode 32 [row 13]
30691movu m6, [r5 + 6 * 16]
30692pmaddubsw m3, m0, m6
30693pmulhrsw m3, m7
30694pmaddubsw m5, m2, m6
30695pmulhrsw m5, m7
30696packuswb m3, m5
30697movu [r0 + 1946 * 16], m3
30698pmaddubsw m3, m1, m6
30699pmulhrsw m3, m7
30700pmaddubsw m5, m4, m6
30701pmulhrsw m5, m7
30702packuswb m3, m5
30703movu [r0 + 1947 * 16], m3
30704
30705; mode 32 [row 14]
30706movu m6, [r5 + 27 * 16]
30707pmaddubsw m3, m0, m6
30708pmulhrsw m3, m7
30709pmaddubsw m5, m2, m6
30710pmulhrsw m5, m7
30711packuswb m3, m5
30712movu [r0 + 1948 * 16], m3
30713pmaddubsw m3, m1, m6
30714pmulhrsw m3, m7
30715pmaddubsw m5, m4, m6
30716pmulhrsw m5, m7
30717packuswb m3, m5
30718movu [r0 + 1949 * 16], m3
30719
30720; mode 30 [row 24]
30721movu m6, [r5 + 5 * 16]
30722movu m0, [r3 + 11]
30723movd m1, [r3 + 12]
30724palignr m1, m0, 1
30725punpcklbw m0, m1
30726pmaddubsw m3, m0, m6
30727pmulhrsw m3, m7
30728movu m2, [r3 + 19]
30729movd m4, [r3 + 20]
30730palignr m4, m2, 1
30731punpcklbw m2, m4
30732pmaddubsw m5, m2, m6
30733pmulhrsw m5, m7
30734packuswb m3, m5
30735movu [r0 + 1840 * 16], m3
30736
30737movu m1, [r3 + 27]
30738movd m3, [r3 + 28]
30739palignr m3, m1, 1
30740punpcklbw m1, m3
30741pmaddubsw m3, m1, m6
30742pmulhrsw m3, m7
30743movu m4, [r3 + 35]
30744movd m5, [r3 + 36]
30745palignr m5, m4, 1
30746punpcklbw m4, m5
30747pmaddubsw m5, m4, m6
30748pmulhrsw m5, m7
30749packuswb m3, m5
30750movu [r0 + 1841 * 16], m3
30751
30752; mode 30 [row 25]
30753movu m6, [r5 + 18 * 16]
30754pmaddubsw m3, m0, m6
30755pmulhrsw m3, m7
30756pmaddubsw m5, m2, m6
30757pmulhrsw m5, m7
30758packuswb m3, m5
30759movu [r0 + 1842 * 16], m3
30760
30761; mode 33 [row 12 - first half]
30762movu [r0 + 2008 * 16], m3
30763
30764pmaddubsw m3, m1, m6
30765pmulhrsw m3, m7
30766pmaddubsw m5, m4, m6
30767pmulhrsw m5, m7
30768packuswb m3, m5
30769movu [r0 + 1843 * 16], m3
30770
30771; mode 33 [row 12 - second half]
30772movu [r0 + 2009 * 16], m3
30773
30774; mode 30 [row 26]
30775movu m6, [r5 + 31 * 16]
30776pmaddubsw m3, m0, m6
30777pmulhrsw m3, m7
30778pmaddubsw m5, m2, m6
30779pmulhrsw m5, m7
30780packuswb m3, m5
30781movu [r0 + 1844 * 16], m3
30782pmaddubsw m3, m1, m6
30783pmulhrsw m3, m7
30784pmaddubsw m5, m4, m6
30785pmulhrsw m5, m7
30786packuswb m3, m5
30787movu [r0 + 1845 * 16], m3
30788
30789; mode 31 [row 18]
30790movu m6, [r5 + 3 * 16]
30791pmaddubsw m3, m0, m6
30792pmulhrsw m3, m7
30793pmaddubsw m5, m2, m6
30794pmulhrsw m5, m7
30795packuswb m3, m5
30796movu [r0 + 1892 * 16], m3
30797pmaddubsw m3, m1, m6
30798pmulhrsw m3, m7
30799pmaddubsw m5, m4, m6
30800pmulhrsw m5, m7
30801packuswb m3, m5
30802movu [r0 + 1893 * 16], m3
30803
30804; mode 31 [row 19]
30805movu m6, [r5 + 20 * 16]
30806pmaddubsw m3, m0, m6
30807pmulhrsw m3, m7
30808pmaddubsw m5, m2, m6
30809pmulhrsw m5, m7
30810packuswb m3, m5
30811movu [r0 + 1894 * 16], m3
30812pmaddubsw m3, m1, m6
30813pmulhrsw m3, m7
30814pmaddubsw m5, m4, m6
30815pmulhrsw m5, m7
30816packuswb m3, m5
30817movu [r0 + 1895 * 16], m3
30818
30819; mode 32 [row 15]
30820movu m6, [r5 + 16 * 16]
30821pmaddubsw m3, m0, m6
30822pmulhrsw m3, m7
30823pmaddubsw m5, m2, m6
30824pmulhrsw m5, m7
30825packuswb m3, m5
30826movu [r0 + 1950 * 16], m3
30827pmaddubsw m3, m1, m6
30828pmulhrsw m3, m7
30829pmaddubsw m5, m4, m6
30830pmulhrsw m5, m7
30831packuswb m3, m5
30832movu [r0 + 1951 * 16], m3
30833
30834; mode 30 [row 27]
30835movu m6, [r5 + 12 * 16]
30836movu m0, [r3 + 12]
30837movd m1, [r3 + 13]
30838palignr m1, m0, 1
30839punpcklbw m0, m1
30840pmaddubsw m3, m0, m6
30841pmulhrsw m3, m7
30842movu m2, [r3 + 20]
30843movd m4, [r3 + 21]
30844palignr m4, m2, 1
30845punpcklbw m2, m4
30846pmaddubsw m5, m2, m6
30847pmulhrsw m5, m7
30848packuswb m3, m5
30849movu [r0 + 1846 * 16], m3
30850
30851; mode 33 [row 13 - first half]
30852movu [r0 + 2010 * 16], m3
30853
30854movu m1, [r3 + 28]
30855movd m3, [r3 + 29]
30856palignr m3, m1, 1
30857punpcklbw m1, m3
30858pmaddubsw m3, m1, m6
30859pmulhrsw m3, m7
30860movu m4, [r3 + 36]
30861movd m5, [r3 + 37]
30862palignr m5, m4, 1
30863punpcklbw m4, m5
30864pmaddubsw m5, m4, m6
30865pmulhrsw m5, m7
30866packuswb m3, m5
30867movu [r0 + 1847 * 16], m3
30868
30869; mode 33 [row 13 - second half]
30870movu [r0 + 2011 * 16], m3
30871
30872; mode 30 [row 28]
30873movu m6, [r5 + 25 * 16]
30874pmaddubsw m3, m0, m6
30875pmulhrsw m3, m7
30876pmaddubsw m5, m2, m6
30877pmulhrsw m5, m7
30878packuswb m3, m5
30879movu [r0 + 1848 * 16], m3
30880pmaddubsw m3, m1, m6
30881pmulhrsw m3, m7
30882pmaddubsw m5, m4, m6
30883pmulhrsw m5, m7
30884packuswb m3, m5
30885movu [r0 + 1849 * 16], m3
30886
30887; mode 31 [row 20]
30888movu m6, [r5 + 5 * 16]
30889pmaddubsw m3, m0, m6
30890pmulhrsw m3, m7
30891pmaddubsw m5, m2, m6
30892pmulhrsw m5, m7
30893packuswb m3, m5
30894movu [r0 + 1896 * 16], m3
30895
30896; mode 32 [row 16 - first half]
30897movu [r0 + 1952 * 16], m3
30898
30899pmaddubsw m3, m1, m6
30900pmulhrsw m3, m7
30901pmaddubsw m5, m4, m6
30902pmulhrsw m5, m7
30903packuswb m3, m5
30904movu [r0 + 1897 * 16], m3
30905
30906; mode 32 [row 16 - second half]
30907movu [r0 + 1953 * 16], m3
30908
30909; mode 31 [row 21]
30910movu m6, [r5 + 22 * 16]
30911pmaddubsw m3, m0, m6
30912pmulhrsw m3, m7
30913pmaddubsw m5, m2, m6
30914pmulhrsw m5, m7
30915packuswb m3, m5
30916movu [r0 + 1898 * 16], m3
30917pmaddubsw m3, m1, m6
30918pmulhrsw m3, m7
30919pmaddubsw m5, m4, m6
30920pmulhrsw m5, m7
30921packuswb m3, m5
30922movu [r0 + 1899 * 16], m3
30923
30924; mode 32 [row 17]
30925movu m6, [r5 + 26 * 16]
30926pmaddubsw m3, m0, m6
30927pmulhrsw m3, m7
30928pmaddubsw m5, m2, m6
30929pmulhrsw m5, m7
30930packuswb m3, m5
30931movu [r0 + 1954 * 16], m3
30932pmaddubsw m3, m1, m6
30933pmulhrsw m3, m7
30934pmaddubsw m5, m4, m6
30935pmulhrsw m5, m7
30936packuswb m3, m5
30937movu [r0 + 1955 * 16], m3
30938
30939; mode 30 [row 29]
30940movu m6, [r5 + 6 * 16]
30941movu m0, [r3 + 13]
30942movd m1, [r3 + 14]
30943palignr m1, m0, 1
30944punpcklbw m0, m1
30945pmaddubsw m3, m0, m6
30946pmulhrsw m3, m7
30947movu m2, [r3 + 21]
30948movd m4, [r3 + 22]
30949palignr m4, m2, 1
30950punpcklbw m2, m4
30951pmaddubsw m5, m2, m6
30952pmulhrsw m5, m7
30953packuswb m3, m5
30954movu [r0 + 1850 * 16], m3
30955
30956; mode 33 [row 14 - first half]
30957movu [r0 + 2012 * 16], m3
30958
30959movu m1, [r3 + 29]
30960movd m3, [r3 + 30]
30961palignr m3, m1, 1
30962punpcklbw m1, m3
30963pmaddubsw m3, m1, m6
30964pmulhrsw m3, m7
30965movu m4, [r3 + 37]
30966movd m5, [r3 + 38]
30967palignr m5, m4, 1
30968punpcklbw m4, m5
30969pmaddubsw m5, m4, m6
30970pmulhrsw m5, m7
30971packuswb m3, m5
30972movu [r0 + 1851 * 16], m3
30973
30974; mode 33 [row 14 - second half]
30975movu [r0 + 2013 * 16], m3
30976
30977; mode 30 [row 30]
30978movu m6, [r5 + 19 * 16]
30979pmaddubsw m3, m0, m6
30980pmulhrsw m3, m7
30981pmaddubsw m5, m2, m6
30982pmulhrsw m5, m7
30983packuswb m3, m5
30984movu [r0 + 1852 * 16], m3
30985pmaddubsw m3, m1, m6
30986pmulhrsw m3, m7
30987pmaddubsw m5, m4, m6
30988pmulhrsw m5, m7
30989packuswb m3, m5
30990movu [r0 + 1853 * 16], m3
30991
30992; mode 31 [row 22]
30993movu m6, [r5 + 7 * 16]
30994pmaddubsw m3, m0, m6
30995pmulhrsw m3, m7
30996pmaddubsw m5, m2, m6
30997pmulhrsw m5, m7
30998packuswb m3, m5
30999movu [r0 + 1900 * 16], m3
31000pmaddubsw m3, m1, m6
31001pmulhrsw m3, m7
31002pmaddubsw m5, m4, m6
31003pmulhrsw m5, m7
31004packuswb m3, m5
31005movu [r0 + 1901 * 16], m3
31006
31007; mode 31 [row 23]
31008movu m6, [r5 + 24 * 16]
31009pmaddubsw m3, m0, m6
31010pmulhrsw m3, m7
31011pmaddubsw m5, m2, m6
31012pmulhrsw m5, m7
31013packuswb m3, m5
31014movu [r0 + 1902 * 16], m3
31015pmaddubsw m3, m1, m6
31016pmulhrsw m3, m7
31017pmaddubsw m5, m4, m6
31018pmulhrsw m5, m7
31019packuswb m3, m5
31020movu [r0 + 1903 * 16], m3
31021
31022; mode 32 [row 18]
31023movu m6, [r5 + 15 * 16]
31024pmaddubsw m3, m0, m6
31025pmulhrsw m3, m7
31026pmaddubsw m5, m2, m6
31027pmulhrsw m5, m7
31028packuswb m3, m5
31029movu [r0 + 1956 * 16], m3
31030pmaddubsw m3, m1, m6
31031pmulhrsw m3, m7
31032pmaddubsw m5, m4, m6
31033pmulhrsw m5, m7
31034packuswb m3, m5
31035movu [r0 + 1957 * 16], m3
31036
31037; mode 30 [row 31]
31038movu m0, [r3 + 14]
31039movd m1, [r3 + 15]
31040palignr m1, m0, 1
31041punpcklbw m0, m1
31042movu m2, [r3 + 22]
31043movd m3, [r3 + 23]
31044palignr m3, m2, 1
31045punpcklbw m2, m3
31046movu m1, [r3 + 30]
31047movd m3, [r3 + 31]
31048palignr m3, m1, 1
31049punpcklbw m1, m3
31050movu m4, [r3 + 38]
31051movd m5, [r3 + 39]
31052palignr m5, m4, 1
31053punpcklbw m4, m5
31054
31055pshufb m5, m0, [tab_S2]
31056movh [r0 + 1854 * 16], m5
31057
31058; mode 33 [row 15 - first eight]
31059movh [r0 + 2014 * 16], m5
31060
31061pshufb m5, m2, [tab_S2]
31062movh [r0 + 1854 * 16 + 8], m5
31063
31064; mode 33 [row 15 - second eight]
31065movh [r0 + 2014 * 16 + 8], m5
31066
31067pshufb m5, m1, [tab_S2]
31068movh [r0 + 1855 * 16], m5
31069
31070; mode 33 [row 15 - third eight]
31071movh [r0 + 2015 * 16], m5
31072
31073pshufb m5, m4, [tab_S2]
31074movh [r0 + 1855 * 16 + 8], m5
31075
31076; mode 33 [row 15 - fourth eight]
31077movh [r0 + 2015 * 16 + 8], m5
31078
31079; mode 31 [row 24]
31080movu m6, [r5 + 9 * 16]
31081pmaddubsw m3, m0, m6
31082pmulhrsw m3, m7
31083pmaddubsw m5, m2, m6
31084pmulhrsw m5, m7
31085packuswb m3, m5
31086movu [r0 + 1904 * 16], m3
31087pmaddubsw m3, m1, m6
31088pmulhrsw m3, m7
31089pmaddubsw m5, m4, m6
31090pmulhrsw m5, m7
31091packuswb m3, m5
31092movu [r0 + 1905 * 16], m3
31093
31094; mode 31 [row 25]
31095movu m6, [r5 + 26 * 16]
31096pmaddubsw m3, m0, m6
31097pmulhrsw m3, m7
31098pmaddubsw m5, m2, m6
31099pmulhrsw m5, m7
31100packuswb m3, m5
31101movu [r0 + 1906 * 16], m3
31102
31103; mode 33 [row 16 - first half]
31104movu [r0 + 2016 * 16], m3
31105
31106pmaddubsw m3, m1, m6
31107pmulhrsw m3, m7
31108pmaddubsw m5, m4, m6
31109pmulhrsw m5, m7
31110packuswb m3, m5
31111movu [r0 + 1907 * 16], m3
31112
31113; mode 33 [row 16 - second half]
31114movu [r0 + 2017 * 16], m3
31115
31116; mode 32 [row 19]
31117movu m6, [r5 + 4 * 16]
31118pmaddubsw m3, m0, m6
31119pmulhrsw m3, m7
31120pmaddubsw m5, m2, m6
31121pmulhrsw m5, m7
31122packuswb m3, m5
31123movu [r0 + 1958 * 16], m3
31124pmaddubsw m3, m1, m6
31125pmulhrsw m3, m7
31126pmaddubsw m5, m4, m6
31127pmulhrsw m5, m7
31128packuswb m3, m5
31129movu [r0 + 1959 * 16], m3
31130
31131; mode 32 [row 20]
31132movu m6, [r5 + 25 * 16]
31133pmaddubsw m3, m0, m6
31134pmulhrsw m3, m7
31135pmaddubsw m5, m2, m6
31136pmulhrsw m5, m7
31137packuswb m3, m5
31138movu [r0 + 1960 * 16], m3
31139pmaddubsw m3, m1, m6
31140pmulhrsw m3, m7
31141pmaddubsw m5, m4, m6
31142pmulhrsw m5, m7
31143packuswb m3, m5
31144movu [r0 + 1961 * 16], m3
31145
31146; mode 31 [row 26]
31147movu m6, [r5 + 11 * 16]
31148movu m0, [r3 + 15]
31149movd m1, [r3 + 16]
31150palignr m1, m0, 1
31151punpcklbw m0, m1
31152pmaddubsw m3, m0, m6
31153pmulhrsw m3, m7
31154movu m2, [r3 + 23]
31155movd m4, [r3 + 24]
31156palignr m4, m2, 1
31157punpcklbw m2, m4
31158pmaddubsw m5, m2, m6
31159pmulhrsw m5, m7
31160packuswb m3, m5
31161movu [r0 + 1908 * 16], m3
31162
31163movu m1, [r3 + 31]
31164movd m3, [r3 + 32]
31165palignr m3, m1, 1
31166punpcklbw m1, m3
31167pmaddubsw m3, m1, m6
31168pmulhrsw m3, m7
31169movu m4, [r3 + 39]
31170movd m5, [r3 + 40]
31171palignr m5, m4, 1
31172punpcklbw m4, m5
31173pmaddubsw m5, m4, m6
31174pmulhrsw m5, m7
31175packuswb m3, m5
31176movu [r0 + 1909 * 16], m3
31177
31178; mode 31 [row 27]
31179movu m6, [r5 + 28 * 16]
31180pmaddubsw m3, m0, m6
31181pmulhrsw m3, m7
31182pmaddubsw m5, m2, m6
31183pmulhrsw m5, m7
31184packuswb m3, m5
31185movu [r0 + 1910 * 16], m3
31186pmaddubsw m3, m1, m6
31187pmulhrsw m3, m7
31188pmaddubsw m5, m4, m6
31189pmulhrsw m5, m7
31190packuswb m3, m5
31191movu [r0 + 1911 * 16], m3
31192
31193; mode 32 [row 21]
31194movu m6, [r5 + 14 * 16]
31195pmaddubsw m3, m0, m6
31196pmulhrsw m3, m7
31197pmaddubsw m5, m2, m6
31198pmulhrsw m5, m7
31199packuswb m3, m5
31200movu [r0 + 1962 * 16], m3
31201pmaddubsw m3, m1, m6
31202pmulhrsw m3, m7
31203pmaddubsw m5, m4, m6
31204pmulhrsw m5, m7
31205packuswb m3, m5
31206movu [r0 + 1963 * 16], m3
31207
31208; mode 33 [row 17]
31209movu m6, [r5 + 20 * 16]
31210pmaddubsw m3, m0, m6
31211pmulhrsw m3, m7
31212pmaddubsw m5, m2, m6
31213pmulhrsw m5, m7
31214packuswb m3, m5
31215movu [r0 + 2018 * 16], m3
31216pmaddubsw m3, m1, m6
31217pmulhrsw m3, m7
31218pmaddubsw m5, m4, m6
31219pmulhrsw m5, m7
31220packuswb m3, m5
31221movu [r0 + 2019 * 16], m3
31222
31223; mode 31 [row 28]
31224movu m6, [r5 + 13 * 16]
31225movu m0, [r3 + 16]
31226movd m1, [r3 + 17]
31227palignr m1, m0, 1
31228punpcklbw m0, m1
31229pmaddubsw m3, m0, m6
31230pmulhrsw m3, m7
31231movu m2, [r3 + 24]
31232movd m4, [r3 + 25]
31233palignr m4, m2, 1
31234punpcklbw m2, m4
31235pmaddubsw m5, m2, m6
31236pmulhrsw m5, m7
31237packuswb m3, m5
31238movu [r0 + 1912 * 16], m3
31239
31240movu m1, [r3 + 32]
31241movd m3, [r3 + 33]
31242palignr m3, m1, 1
31243punpcklbw m1, m3
31244pmaddubsw m3, m1, m6
31245pmulhrsw m3, m7
31246movu m4, [r3 + 40]
31247movd m5, [r3 + 41]
31248palignr m5, m4, 1
31249punpcklbw m4, m5
31250pmaddubsw m5, m4, m6
31251pmulhrsw m5, m7
31252packuswb m3, m5
31253movu [r0 + 1913 * 16], m3
31254
31255; mode 31 [row 29]
31256movu m6, [r5 + 30 * 16]
31257pmaddubsw m3, m0, m6
31258pmulhrsw m3, m7
31259pmaddubsw m5, m2, m6
31260pmulhrsw m5, m7
31261packuswb m3, m5
31262movu [r0 + 1914 * 16], m3
31263pmaddubsw m3, m1, m6
31264pmulhrsw m3, m7
31265pmaddubsw m5, m4, m6
31266pmulhrsw m5, m7
31267packuswb m3, m5
31268movu [r0 + 1915 * 16], m3
31269
31270; mode 32 [row 22]
31271movu m6, [r5 + 3 * 16]
31272pmaddubsw m3, m0, m6
31273pmulhrsw m3, m7
31274pmaddubsw m5, m2, m6
31275pmulhrsw m5, m7
31276packuswb m3, m5
31277movu [r0 + 1964 * 16], m3
31278pmaddubsw m3, m1, m6
31279pmulhrsw m3, m7
31280pmaddubsw m5, m4, m6
31281pmulhrsw m5, m7
31282packuswb m3, m5
31283movu [r0 + 1965 * 16], m3
31284
31285; mode 32 [row 23]
31286movu m6, [r5 + 24 * 16]
31287pmaddubsw m3, m0, m6
31288pmulhrsw m3, m7
31289pmaddubsw m5, m2, m6
31290pmulhrsw m5, m7
31291packuswb m3, m5
31292movu [r0 + 1966 * 16], m3
31293pmaddubsw m3, m1, m6
31294pmulhrsw m3, m7
31295pmaddubsw m5, m4, m6
31296pmulhrsw m5, m7
31297packuswb m3, m5
31298movu [r0 + 1967 * 16], m3
31299
31300; mode 33 [row 18]
31301movu m6, [r5 + 14 * 16]
31302pmaddubsw m3, m0, m6
31303pmulhrsw m3, m7
31304pmaddubsw m5, m2, m6
31305pmulhrsw m5, m7
31306packuswb m3, m5
31307movu [r0 + 2020 * 16], m3
31308pmaddubsw m3, m1, m6
31309pmulhrsw m3, m7
31310pmaddubsw m5, m4, m6
31311pmulhrsw m5, m7
31312packuswb m3, m5
31313movu [r0 + 2021 * 16], m3
31314
31315; mode 31 [row 30]
31316movu m6, [r5 + 15 * 16]
31317movu m0, [r3 + 17]
31318movd m1, [r3 + 18]
31319palignr m1, m0, 1
31320punpcklbw m0, m1
31321pmaddubsw m3, m0, m6
31322pmulhrsw m3, m7
31323movu m2, [r3 + 25]
31324movd m4, [r3 + 26]
31325palignr m4, m2, 1
31326punpcklbw m2, m4
31327pmaddubsw m5, m2, m6
31328pmulhrsw m5, m7
31329packuswb m3, m5
31330movu [r0 + 1916 * 16], m3
31331
31332movu m1, [r3 + 33]
31333movd m3, [r3 + 34]
31334palignr m3, m1, 1
31335punpcklbw m1, m3
31336pmaddubsw m3, m1, m6
31337pmulhrsw m3, m7
31338movu m4, [r3 + 41]
31339movd m5, [r3 + 42]
31340palignr m5, m4, 1
31341punpcklbw m4, m5
31342pmaddubsw m5, m4, m6
31343pmulhrsw m5, m7
31344packuswb m3, m5
31345movu [r0 + 1917 * 16], m3
31346
31347; mode 32 [row 24]
31348movu m6, [r5 + 13 * 16]
31349pmaddubsw m3, m0, m6
31350pmulhrsw m3, m7
31351pmaddubsw m5, m2, m6
31352pmulhrsw m5, m7
31353packuswb m3, m5
31354movu [r0 + 1968 * 16], m3
31355pmaddubsw m3, m1, m6
31356pmulhrsw m3, m7
31357pmaddubsw m5, m4, m6
31358pmulhrsw m5, m7
31359packuswb m3, m5
31360movu [r0 + 1969 * 16], m3
31361
31362; mode 33 [row 19]
31363movu m6, [r5 + 8 * 16]
31364pmaddubsw m3, m0, m6
31365pmulhrsw m3, m7
31366pmaddubsw m5, m2, m6
31367pmulhrsw m5, m7
31368packuswb m3, m5
31369movu [r0 + 2022 * 16], m3
31370pmaddubsw m3, m1, m6
31371pmulhrsw m3, m7
31372pmaddubsw m5, m4, m6
31373pmulhrsw m5, m7
31374packuswb m3, m5
31375movu [r0 + 2023 * 16], m3
31376
31377; mode 31 [row 31]
31378movu m0, [r3 + 18]
31379movd m1, [r3 + 19]
31380palignr m1, m0, 1
31381punpcklbw m0, m1
31382movu m2, [r3 + 26]
31383movd m3, [r3 + 27]
31384palignr m3, m2, 1
31385punpcklbw m2, m3
31386movu m1, [r3 + 34]
31387movd m3, [r3 + 35]
31388palignr m3, m1, 1
31389punpcklbw m1, m3
31390movu m4, [r3 + 42]
31391movd m5, [r3 + 43]
31392palignr m5, m4, 1
31393punpcklbw m4, m5
31394
31395pshufb m5, m0, [tab_S2]
31396movh [r0 + 1918 * 16], m5
31397pshufb m5, m2, [tab_S2]
31398movh [r0 + 1918 * 16 + 8], m5
31399pshufb m5, m1, [tab_S2]
31400movh [r0 + 1919 * 16], m5
31401pshufb m5, m4, [tab_S2]
31402movh [r0 + 1919 * 16 + 8], m5
31403
31404; mode 32 [row 25]
31405movu m6, [r5 + 2 * 16]
31406pmaddubsw m3, m0, m6
31407pmulhrsw m3, m7
31408pmaddubsw m5, m2, m6
31409pmulhrsw m5, m7
31410packuswb m3, m5
31411movu [r0 + 1970 * 16], m3
31412
31413; mode 33 [row 20 - first half]
31414movu [r0 + 2024 * 16], m3
31415
31416pmaddubsw m3, m1, m6
31417pmulhrsw m3, m7
31418pmaddubsw m5, m4, m6
31419pmulhrsw m5, m7
31420packuswb m3, m5
31421movu [r0 + 1971 * 16], m3
31422
31423; mode 33 [row 20 - second half]
31424movu [r0 + 2025 * 16], m3
31425
31426; mode 32 [row 26]
31427movu m6, [r5 + 23 * 16]
31428pmaddubsw m3, m0, m6
31429pmulhrsw m3, m7
31430pmaddubsw m5, m2, m6
31431pmulhrsw m5, m7
31432packuswb m3, m5
31433movu [r0 + 1972 * 16], m3
31434pmaddubsw m3, m1, m6
31435pmulhrsw m3, m7
31436pmaddubsw m5, m4, m6
31437pmulhrsw m5, m7
31438packuswb m3, m5
31439movu [r0 + 1973 * 16], m3
31440
31441; mode 33 [row 21]
31442movu m6, [r5 + 28 * 16]
31443pmaddubsw m3, m0, m6
31444pmulhrsw m3, m7
31445pmaddubsw m5, m2, m6
31446pmulhrsw m5, m7
31447packuswb m3, m5
31448movu [r0 + 2026 * 16], m3
31449pmaddubsw m3, m1, m6
31450pmulhrsw m3, m7
31451pmaddubsw m5, m4, m6
31452pmulhrsw m5, m7
31453packuswb m3, m5
31454movu [r0 + 2027 * 16], m3
31455
31456; mode 32 [row 27]
31457movu m6, [r5 + 12 * 16]
31458movu m0, [r3 + 19]
31459movd m1, [r3 + 20]
31460palignr m1, m0, 1
31461punpcklbw m0, m1
31462pmaddubsw m3, m0, m6
31463pmulhrsw m3, m7
31464movu m2, [r3 + 27]
31465movd m4, [r3 + 28]
31466palignr m4, m2, 1
31467punpcklbw m2, m4
31468pmaddubsw m5, m2, m6
31469pmulhrsw m5, m7
31470packuswb m3, m5
31471movu [r0 + 1974 * 16], m3
31472
31473movu m1, [r3 + 35]
31474movd m3, [r3 + 36]
31475palignr m3, m1, 1
31476punpcklbw m1, m3
31477pmaddubsw m3, m1, m6
31478pmulhrsw m3, m7
31479movu m4, [r3 + 43]
31480movd m5, [r3 + 44]
31481palignr m5, m4, 1
31482punpcklbw m4, m5
31483pmaddubsw m5, m4, m6
31484pmulhrsw m5, m7
31485packuswb m3, m5
31486movu [r0 + 1975 * 16], m3
31487
31488; mode 33 [row 22]
31489movu m6, [r5 + 22 * 16]
31490pmaddubsw m3, m0, m6
31491pmulhrsw m3, m7
31492pmaddubsw m5, m2, m6
31493pmulhrsw m5, m7
31494packuswb m3, m5
31495movu [r0 + 2028 * 16], m3
31496pmaddubsw m3, m1, m6
31497pmulhrsw m3, m7
31498pmaddubsw m5, m4, m6
31499pmulhrsw m5, m7
31500packuswb m3, m5
31501movu [r0 + 2029 * 16], m3
31502
31503; mode 32 [row 28]
31504movu m6, [r5 + 1 * 16]
31505movu m0, [r3 + 20]
31506movd m1, [r3 + 21]
31507palignr m1, m0, 1
31508punpcklbw m0, m1
31509pmaddubsw m3, m0, m6
31510pmulhrsw m3, m7
31511movu m2, [r3 + 28]
31512movd m4, [r3 + 29]
31513palignr m4, m2, 1
31514punpcklbw m2, m4
31515pmaddubsw m5, m2, m6
31516pmulhrsw m5, m7
31517packuswb m3, m5
31518movu [r0 + 1976 * 16], m3
31519
31520movu m1, [r3 + 36]
31521movd m3, [r3 + 37]
31522palignr m3, m1, 1
31523punpcklbw m1, m3
31524pmaddubsw m3, m1, m6
31525pmulhrsw m3, m7
31526movu m4, [r3 + 44]
31527movd m5, [r3 + 45]
31528palignr m5, m4, 1
31529punpcklbw m4, m5
31530pmaddubsw m5, m4, m6
31531pmulhrsw m5, m7
31532packuswb m3, m5
31533movu [r0 + 1977 * 16], m3
31534
31535; mode 32 [row 29]
31536movu m6, [r5 + 22 * 16]
31537pmaddubsw m3, m0, m6
31538pmulhrsw m3, m7
31539pmaddubsw m5, m2, m6
31540pmulhrsw m5, m7
31541packuswb m3, m5
31542movu [r0 + 1978 * 16], m3
31543pmaddubsw m3, m1, m6
31544pmulhrsw m3, m7
31545pmaddubsw m5, m4, m6
31546pmulhrsw m5, m7
31547packuswb m3, m5
31548movu [r0 + 1979 * 16], m3
31549
31550; mode 33 [row 23]
31551movu m6, [r5 + 16 * 16]
31552pmaddubsw m3, m0, m6
31553pmulhrsw m3, m7
31554pmaddubsw m5, m2, m6
31555pmulhrsw m5, m7
31556packuswb m3, m5
31557movu [r0 + 2030 * 16], m3
31558pmaddubsw m3, m1, m6
31559pmulhrsw m3, m7
31560pmaddubsw m5, m4, m6
31561pmulhrsw m5, m7
31562packuswb m3, m5
31563movu [r0 + 2031 * 16], m3
31564
31565; mode 32 [row 30]
31566movu m6, [r5 + 11 * 16]
31567movu m0, [r3 + 21]
31568movd m1, [r3 + 22]
31569palignr m1, m0, 1
31570punpcklbw m0, m1
31571pmaddubsw m3, m0, m6
31572pmulhrsw m3, m7
31573movu m2, [r3 + 29]
31574movd m4, [r3 + 30]
31575palignr m4, m2, 1
31576punpcklbw m2, m4
31577pmaddubsw m5, m2, m6
31578pmulhrsw m5, m7
31579packuswb m3, m5
31580movu [r0 + 1980 * 16], m3
31581
31582movu m1, [r3 + 37]
31583movd m3, [r3 + 38]
31584palignr m3, m1, 1
31585punpcklbw m1, m3
31586pmaddubsw m3, m1, m6
31587pmulhrsw m3, m7
31588movu m4, [r3 + 45]
31589movd m5, [r3 + 46]
31590palignr m5, m4, 1
31591punpcklbw m4, m5
31592pmaddubsw m5, m4, m6
31593pmulhrsw m5, m7
31594packuswb m3, m5
31595movu [r0 + 1981 * 16], m3
31596
31597; mode 33 [row 24]
31598movu m6, [r5 + 10 * 16]
31599pmaddubsw m3, m0, m6
31600pmulhrsw m3, m7
31601pmaddubsw m5, m2, m6
31602pmulhrsw m5, m7
31603packuswb m3, m5
31604movu [r0 + 2032 * 16], m3
31605pmaddubsw m3, m1, m6
31606pmulhrsw m3, m7
31607pmaddubsw m5, m4, m6
31608pmulhrsw m5, m7
31609packuswb m3, m5
31610movu [r0 + 2033 * 16], m3
31611
31612; mode 32 [row 31]
31613movu m0, [r3 + 22]
31614movd m1, [r3 + 23]
31615palignr m1, m0, 1
31616punpcklbw m0, m1
31617movu m2, [r3 + 30]
31618movd m3, [r3 + 31]
31619palignr m3, m2, 1
31620punpcklbw m2, m3
31621movu m1, [r3 + 38]
31622movd m3, [r3 + 39]
31623palignr m3, m1, 1
31624punpcklbw m1, m3
31625movu m4, [r3 + 46]
31626movd m5, [r3 + 47]
31627palignr m5, m4, 1
31628punpcklbw m4, m5
31629
31630pshufb m5, m0, [tab_S2]
31631movh [r0 + 1982 * 16], m5
31632pshufb m5, m2, [tab_S2]
31633movh [r0 + 1982 * 16 + 8], m5
31634pshufb m5, m1, [tab_S2]
31635movh [r0 + 1983 * 16], m5
31636pshufb m5, m4, [tab_S2]
31637movh [r0 + 1983 * 16 + 8], m5
31638
31639; mode 33 [row 25]
31640movu m6, [r5 + 4 * 16]
31641pmaddubsw m3, m0, m6
31642pmulhrsw m3, m7
31643pmaddubsw m5, m2, m6
31644pmulhrsw m5, m7
31645packuswb m3, m5
31646movu [r0 + 2034 * 16], m3
31647pmaddubsw m3, m1, m6
31648pmulhrsw m3, m7
31649pmaddubsw m5, m4, m6
31650pmulhrsw m5, m7
31651packuswb m3, m5
31652movu [r0 + 2035 * 16], m3
31653
31654; mode 33 [row 26]
31655movu m6, [r5 + 30 * 16]
31656pmaddubsw m3, m0, m6
31657pmulhrsw m3, m7
31658pmaddubsw m5, m2, m6
31659pmulhrsw m5, m7
31660packuswb m3, m5
31661movu [r0 + 2036 * 16], m3
31662pmaddubsw m3, m1, m6
31663pmulhrsw m3, m7
31664pmaddubsw m5, m4, m6
31665pmulhrsw m5, m7
31666packuswb m3, m5
31667movu [r0 + 2037 * 16], m3
31668
31669; mode 33 [row 27]
31670movu m6, [r5 + 24 * 16]
31671movu m0, [r3 + 23]
31672movd m1, [r3 + 24]
31673palignr m1, m0, 1
31674punpcklbw m0, m1
31675pmaddubsw m3, m0, m6
31676pmulhrsw m3, m7
31677movu m2, [r3 + 31]
31678movd m4, [r3 + 32]
31679palignr m4, m2, 1
31680punpcklbw m2, m4
31681pmaddubsw m5, m2, m6
31682pmulhrsw m5, m7
31683packuswb m3, m5
31684movu [r0 + 2038 * 16], m3
31685
31686movu m1, [r3 + 39]
31687movd m3, [r3 + 40]
31688palignr m3, m1, 1
31689punpcklbw m1, m3
31690pmaddubsw m3, m1, m6
31691pmulhrsw m3, m7
31692movu m4, [r3 + 47]
31693movd m5, [r3 + 48]
31694palignr m5, m4, 1
31695punpcklbw m4, m5
31696pmaddubsw m5, m4, m6
31697pmulhrsw m5, m7
31698packuswb m3, m5
31699movu [r0 + 2039 * 16], m3
31700
31701; mode 33 [row 28]
31702movu m6, [r5 + 18 * 16]
31703movu m0, [r3 + 24]
31704movd m1, [r3 + 25]
31705palignr m1, m0, 1
31706punpcklbw m0, m1
31707pmaddubsw m3, m0, m6
31708pmulhrsw m3, m7
31709movu m2, [r3 + 32]
31710movd m4, [r3 + 33]
31711palignr m4, m2, 1
31712punpcklbw m2, m4
31713pmaddubsw m5, m2, m6
31714pmulhrsw m5, m7
31715packuswb m3, m5
31716movu [r0 + 2040 * 16], m3
31717
31718movu m1, [r3 + 40]
31719movd m3, [r3 + 41]
31720palignr m3, m1, 1
31721punpcklbw m1, m3
31722pmaddubsw m3, m1, m6
31723pmulhrsw m3, m7
31724movu m4, [r3 + 48]
31725movd m5, [r3 + 49]
31726palignr m5, m4, 1
31727punpcklbw m4, m5
31728pmaddubsw m5, m4, m6
31729pmulhrsw m5, m7
31730packuswb m3, m5
31731movu [r0 + 2041 * 16], m3
31732
31733; mode 33 [row 29]
31734movu m6, [r5 + 12 * 16]
31735movu m0, [r3 + 25]
31736movd m1, [r3 + 26]
31737palignr m1, m0, 1
31738punpcklbw m0, m1
31739pmaddubsw m3, m0, m6
31740pmulhrsw m3, m7
31741movu m2, [r3 + 33]
31742movd m4, [r3 + 34]
31743palignr m4, m2, 1
31744punpcklbw m2, m4
31745pmaddubsw m5, m2, m6
31746pmulhrsw m5, m7
31747packuswb m3, m5
31748movu [r0 + 2042 * 16], m3
31749
31750movu m1, [r3 + 41]
31751movd m3, [r3 + 42]
31752palignr m3, m1, 1
31753punpcklbw m1, m3
31754pmaddubsw m3, m1, m6
31755pmulhrsw m3, m7
31756movu m4, [r3 + 49]
31757movd m5, [r3 + 50]
31758palignr m5, m4, 1
31759punpcklbw m4, m5
31760pmaddubsw m5, m4, m6
31761pmulhrsw m5, m7
31762packuswb m3, m5
31763movu [r0 + 2043 * 16], m3
31764
31765; mode 33 [row 30]
31766movu m6, [r5 + 6 * 16]
31767movu m0, [r3 + 26]
31768movd m1, [r3 + 27]
31769palignr m1, m0, 1
31770punpcklbw m0, m1
31771pmaddubsw m3, m0, m6
31772pmulhrsw m3, m7
31773movu m2, [r3 + 34]
31774movd m4, [r3 + 35]
31775palignr m4, m2, 1
31776punpcklbw m2, m4
31777pmaddubsw m5, m2, m6
31778pmulhrsw m5, m7
31779packuswb m3, m5
31780movu [r0 + 2044 * 16], m3
31781
31782movu m1, [r3 + 42]
31783movd m3, [r3 + 43]
31784palignr m3, m1, 1
31785punpcklbw m1, m3
31786pmaddubsw m3, m1, m6
31787pmulhrsw m3, m7
31788movu m4, [r3 + 50]
31789movd m5, [r3 + 51]
31790palignr m5, m4, 1
31791punpcklbw m4, m5
31792pmaddubsw m5, m4, m6
31793pmulhrsw m5, m7
31794packuswb m3, m5
31795movu [r0 + 2045 * 16], m3
31796
31797; mode 33 [row 31]
31798movu m5, [r3 + 27]
31799movu [r0 + 2046 * 16], m5
31800movu m5, [r3 + 43]
31801movu [r0 + 2047 * 16], m5
31802
31803;mode 34 [row 0]
31804movu m0, [r3 + 2]
31805movu [r0 + 2048 * 16], m0
31806movu m1, [r3 + 18]
31807movu [r0 + 2049 * 16], m1
31808
31809;mode 34 [row 1]
31810movu m2, [r3 + 34]
31811palignr m3, m1, m0, 1
31812movu [r0 + 2050 * 16], m3
31813palignr m4, m2, m1, 1
31814movu [r0 + 2051 * 16], m4
31815
31816;mode 34 [row 2]
31817palignr m3, m1, m0, 2
31818movu [r0 + 2052 * 16], m3
31819palignr m4, m2, m1, 2
31820movu [r0 + 2053 * 16], m4
31821
31822;mode 34 [row 3]
31823palignr m3, m1, m0, 3
31824movu [r0 + 2054 * 16], m3
31825palignr m4, m2, m1, 3
31826movu [r0 + 2055 * 16], m4
31827
31828;mode 34 [row 4]
31829palignr m3, m1, m0, 4
31830movu [r0 + 2056 * 16], m3
31831palignr m4, m2, m1, 4
31832movu [r0 + 2057 * 16], m4
31833
31834;mode 34 [row 5]
31835palignr m3, m1, m0, 5
31836movu [r0 + 2058 * 16], m3
31837palignr m4, m2, m1, 5
31838movu [r0 + 2059 * 16], m4
31839
31840;mode 34 [row 6]
31841palignr m3, m1, m0, 6
31842movu [r0 + 2060 * 16], m3
31843palignr m4, m2, m1, 6
31844movu [r0 + 2061 * 16], m4
31845
31846;mode 34 [row 7]
31847palignr m3, m1, m0, 7
31848movu [r0 + 2062 * 16], m3
31849palignr m4, m2, m1, 7
31850movu [r0 + 2063 * 16], m4
31851
31852;mode 34 [row 8]
31853palignr m3, m1, m0, 8
31854movu [r0 + 2064 * 16], m3
31855palignr m4, m2, m1, 8
31856movu [r0 + 2065 * 16], m4
31857
31858;mode 34 [row 9]
31859palignr m3, m1, m0, 9
31860movu [r0 + 2066 * 16], m3
31861palignr m4, m2, m1, 9
31862movu [r0 + 2067 * 16], m4
31863
31864;mode 34 [row 10]
31865palignr m3, m1, m0, 10
31866movu [r0 + 2068 * 16], m3
31867palignr m4, m2, m1, 10
31868movu [r0 + 2069 * 16], m4
31869
31870;mode 34 [row 11]
31871palignr m3, m1, m0, 11
31872movu [r0 + 2070 * 16], m3
31873palignr m4, m2, m1, 11
31874movu [r0 + 2071 * 16], m4
31875
31876;mode 34 [row 12]
31877palignr m3, m1, m0, 12
31878movu [r0 + 2072 * 16], m3
31879palignr m4, m2, m1, 12
31880movu [r0 + 2073 * 16], m4
31881
31882;mode 34 [row 13]
31883palignr m3, m1, m0, 13
31884movu [r0 + 2074 * 16], m3
31885palignr m4, m2, m1, 13
31886movu [r0 + 2075 * 16], m4
31887
31888;mode 34 [row 14]
31889palignr m3, m1, m0, 14
31890movu [r0 + 2076 * 16], m3
31891palignr m4, m2, m1, 14
31892movu [r0 + 2077 * 16], m4
31893
31894;mode 34 [row 15]
31895palignr m3, m1, m0, 15
31896movu [r0 + 2078 * 16], m3
31897palignr m4, m2, m1, 15
31898movu [r0 + 2079 * 16], m4
31899
31900;mode 34 [row 16]
31901palignr m3, m1, m0, 16
31902movu [r0 + 2080 * 16], m3
31903palignr m4, m2, m1, 16
31904movu [r0 + 2081 * 16], m4
31905
31906;mode 34 [row 17]
31907movu m0, [r3 + 19]
31908movu [r0 + 2082 * 16], m0
31909movu m1, [r3 + 35]
31910movu [r0 + 2083 * 16], m1
31911
31912;mode 34 [row 18]
31913movu m2, [r3 + 51]
31914palignr m3, m1, m0, 1
31915movu [r0 + 2084 * 16], m3
31916palignr m4, m2, m1, 1
31917movu [r0 + 2085 * 16], m4
31918
31919;mode 34 [row 19]
31920palignr m3, m1, m0, 2
31921movu [r0 + 2086 * 16], m3
31922palignr m4, m2, m1, 2
31923movu [r0 + 2087 * 16], m4
31924
31925;mode 34 [row 20]
31926palignr m3, m1, m0, 3
31927movu [r0 + 2088 * 16], m3
31928palignr m4, m2, m1, 3
31929movu [r0 + 2089 * 16], m4
31930
31931;mode 34 [row 21]
31932palignr m3, m1, m0, 4
31933movu [r0 + 2090 * 16], m3
31934palignr m4, m2, m1, 4
31935movu [r0 + 2091 * 16], m4
31936
31937;mode 34 [row 22]
31938palignr m3, m1, m0, 5
31939movu [r0 + 2092 * 16], m3
31940palignr m4, m2, m1, 5
31941movu [r0 + 2093 * 16], m4
31942
31943;mode 34 [row 23]
31944palignr m3, m1, m0, 6
31945movu [r0 + 2094 * 16], m3
31946palignr m4, m2, m1, 6
31947movu [r0 + 2095 * 16], m4
31948
31949;mode 34 [row 24]
31950palignr m3, m1, m0, 7
31951movu [r0 + 2096 * 16], m3
31952palignr m4, m2, m1, 7
31953movu [r0 + 2097 * 16], m4
31954
31955;mode 34 [row 25]
31956palignr m3, m1, m0, 8
31957movu [r0 + 2098 * 16], m3
31958palignr m4, m2, m1, 8
31959movu [r0 + 2099 * 16], m4
31960
31961;mode 34 [row 26]
31962palignr m3, m1, m0, 9
31963movu [r0 + 2100 * 16], m3
31964palignr m4, m2, m1, 9
31965movu [r0 + 2101 * 16], m4
31966
31967;mode 34 [row 27]
31968palignr m3, m1, m0, 10
31969movu [r0 + 2102 * 16], m3
31970palignr m4, m2, m1, 10
31971movu [r0 + 2103 * 16], m4
31972
31973;mode 34 [row 28]
31974palignr m3, m1, m0, 11
31975movu [r0 + 2104 * 16], m3
31976palignr m4, m2, m1, 11
31977movu [r0 + 2105 * 16], m4
31978
31979;mode 34 [row 29]
31980palignr m3, m1, m0, 12
31981movu [r0 + 2106 * 16], m3
31982palignr m4, m2, m1, 12
31983movu [r0 + 2107 * 16], m4
31984
31985;mode 34 [row 30]
31986palignr m3, m1, m0, 13
31987movu [r0 + 2108 * 16], m3
31988palignr m4, m2, m1, 13
31989movu [r0 + 2109 * 16], m4
31990
31991;mode 34 [row 31]
31992palignr m3, m1, m0, 14
31993movu [r0 + 2110 * 16], m3
31994palignr m4, m2, m1, 14
31995movu [r0 + 2111 * 16], m4
31996
31997RET