Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / intrapred8.asm
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5 ;*
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
10 ;*
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
15 ;*
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 ;*
20 ;* This program is also available under a commercial proprietary license.
21 ;* For more information, contact us at license @ x265.com.
22 ;*****************************************************************************/
23
24 %include "x86inc.asm"
25 %include "x86util.asm"
26
27 SECTION_RODATA 32
28
; Read-only byte tables. pb_0_8, pb_unpackbw1 and c_trans_4x4 are used by the
; 4x4 angular routines below as pshufb control masks.
; NOTE(review): the remaining shuffle / c_mode* tables have no users in the code
; visible here - presumably they are consumed further down the file.
29 pb_0_8 times 8 db 0, 8
30 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
31 pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0
32 c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
33 tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
34 pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
35 c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
36 c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
37 c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
38 c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
39 c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
40 c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
41 c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
42 c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
43 c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
44 c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
45 c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
46 tab_S1: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
47 pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
48 c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
49 c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
50 c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
51 c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
52 c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
53 c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
54 c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
55 tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
56
; ang_table: 32 entries of 16 bytes each. Entry x holds the byte pair
; (32 - x, x) repeated 8 times. The angular predictors address entry k as
; [ang_table + k * 16] and use it as the pmaddubsw weight operand, so each
; output is (32 - k) * p0 + k * p1 for an interleaved pixel pair (p0, p1).
57 const ang_table
58 %assign x 0
59 %rep 32
60 times 8 db (32-x), x
61 %assign x x+1
62 %endrep
63
64 SECTION .text
65
66 cextern pw_8
67 cextern pw_1024
68 cextern pb_unpackbd1
69 cextern multiL
70 cextern multiH
71 cextern multiH2
72 cextern multiH3
73 cextern multi_2Row
74
75 ;-----------------------------------------------------------------------------
76 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 4x4 DC intra prediction.
; Fills the 4x4 block at dst with the rounded mean of the 4 bytes at left + 1
; and the 4 bytes at above + 1. When filter != 0 the first row and the first
; column are then re-blended with their neighbouring reference pixels.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; filter is read from r5m.
; dirMode is not read by this routine.
77 ;-----------------------------------------------------------------------------
78 INIT_XMM sse4
79 cglobal intra_pred_dc4, 4,6,3
80 mov r4d, r5m ; r4d = filter flag
81 inc r2 ; r2 = left + 1
82 inc r3 ; r3 = above + 1
83 pxor m0, m0 ; m0 = 0: psadbw operand and pshufb "broadcast byte 0" mask
84 movd m1, [r2]
85 movd m2, [r3]
86 punpckldq m1, m2 ; low 8 bytes = 4 left pixels followed by 4 above pixels
87 psadbw m1, m0 ; m1 = sum
88
89 test r4d, r4d ; ZF = (filter == 0); consumed by "jz .end" below, nothing in between modifies flags
90
91 mov r4d, 4096
92 movd m2, r4d
93 pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
94 movd r4d, m1 ; r4d = dc_val
95 pshufb m1, m0 ; m1 = byte [dc_val ...]
96
97 ; store DC 4x4
98 lea r5, [r1 * 3]
99 movd [r0], m1
100 movd [r0 + r1], m1
101 movd [r0 + r1 * 2], m1
102 movd [r0 + r5], m1
103
104 ; do DC filter
105 jz .end ; filter == 0: the plain DC block is the final result
106 lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
107 add r4d, r5d ; r4d = DC * 3 + 2
108 movd m1, r4d
109 pshuflw m1, m1, 0 ; m1 = pixDCx3
110
111 ; filter top
; row 0: dst[x] = (above[x] + 3 * DC + 2) >> 2
112 pmovzxbw m2, [r3]
113 paddw m2, m1
114 psraw m2, 2
115 packuswb m2, m2
116 movd [r0], m2 ; overwrite top-left pixel, we will update it later
117
118 ; filter top-left
; dst[0] = (first above pixel + first left pixel + 2 * DC + 2) >> 2
119 movzx r3d, byte [r3]
120 add r5d, r3d
121 movzx r3d, byte [r2]
122 add r3d, r5d
123 shr r3d, 2
124 mov [r0], r3b
125
126 ; filter left
; column 0 of rows 1..3: (left pixel + 3 * DC + 2) >> 2
127 add r0, r1
128 pmovzxbw m2, [r2 + 1]
129 paddw m2, m1
130 psraw m2, 2
131 packuswb m2, m2
132 pextrb [r0], m2, 0
133 pextrb [r0 + r1], m2, 1
134 pextrb [r0 + r1 * 2], m2, 2
135
136 .end:
137 RET
138
139
140 ;-------------------------------------------------------------------------------------------
141 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 8x8 DC intra prediction.
; Fills the 8x8 block at dst with (sum of 8 bytes at left + 1 and 8 bytes at
; above + 1, plus 8) >> 4. When filter != 0 the first row and first column are
; then re-blended with their neighbouring reference pixels.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; filter is read from r5m.
; dirMode is not read by this routine.
142 ;-------------------------------------------------------------------------------------------
143 INIT_XMM sse4
144 cglobal intra_pred_dc8, 4, 7, 3
145 mov r4d, r5m ; r4d = filter flag
146 inc r2 ; r2 = left + 1
147 inc r3 ; r3 = above + 1
148 pxor m0, m0 ; m0 = 0: psadbw operand and pshufb "broadcast byte 0" mask
149 movh m1, [r2]
150 movh m2, [r3]
151 punpcklqdq m1, m2 ; m1 = 8 left pixels | 8 above pixels
152 psadbw m1, m0 ; one partial sum per 64-bit half
153 pshufd m2, m1, 2
154 paddw m1, m2 ; low word = sum of all 16 pixels
155
156 movd r5d, m1
157 add r5d, 8
158 shr r5d, 4 ; sum = sum / 16
159 movd m1, r5d
160 pshufb m1, m0 ; m1 = byte [dc_val ...]
161
162 test r4d, r4d ; ZF = (filter == 0); consumed by "jz .end" below, nothing in between modifies flags
163
164 ; store DC 8x8
165 mov r6, r0 ; r6 = dst, kept for the filter pass because r0 is advanced
166 movh [r0], m1
167 movh [r0 + r1], m1
168 lea r0, [r0 + r1 * 2]
169 movh [r0], m1
170 movh [r0 + r1], m1
171 lea r0, [r0 + r1 * 2]
172 movh [r0], m1
173 movh [r0 + r1], m1
174 lea r0, [r0 + r1 * 2]
175 movh [r0], m1
176 movh [r0 + r1], m1
177
178 ; Do DC Filter
179 jz .end ; filter == 0: the plain DC block is the final result
180 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
181 add r5d, r4d ; r5d = DC * 3 + 2
182 movd m1, r5d
183 pshuflw m1, m1, 0 ; m1 = pixDCx3
184 pshufd m1, m1, 0
185
186 ; filter top
; row 0: dst[x] = (above[x] + 3 * DC + 2) >> 2
187 pmovzxbw m2, [r3]
188 paddw m2, m1
189 psraw m2, 2
190 packuswb m2, m2
191 movh [r6], m2
192
193 ; filter top-left
; dst[0] = (first above pixel + first left pixel + 2 * DC + 2) >> 2
194 movzx r3d, byte [r3]
195 add r4d, r3d
196 movzx r3d, byte [r2]
197 add r3d, r4d
198 shr r3d, 2
199 mov [r6], r3b
200
201 ; filter left
; column 0 of rows 1..7: (left pixel + 3 * DC + 2) >> 2
202 add r6, r1 ; r6 = row 1
203 pmovzxbw m2, [r2 + 1]
204 paddw m2, m1
205 psraw m2, 2
206 packuswb m2, m2
207 pextrb [r6], m2, 0
208 pextrb [r6 + r1], m2, 1
209 pextrb [r6 + 2 * r1], m2, 2
210 lea r6, [r6 + r1 * 2] ; r6 = row 3
211 pextrb [r6 + r1], m2, 3
212 pextrb [r6 + r1 * 2], m2, 4
213 pextrb [r6 + r1 * 4], m2, 6 ; row 7 is written before row 6 because r1 is rescaled next
214 lea r1, [r1 * 3]
215 pextrb [r6 + r1], m2, 5 ; row 6 (r1 now holds 3 * stride)
216
217 .end:
218 RET
219
220 ;-------------------------------------------------------------------------------------------
221 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 16x16 DC intra prediction.
; Fills the 16x16 block at dst with (sum of 16 bytes at left + 1 and 16 bytes
; at above + 1, plus 16) >> 5. When filter != 0 the first row and first column
; are then re-blended with their neighbouring reference pixels.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; filter is read from r5m.
; dirMode is not read, so only the first 4 arguments are loaded by cglobal
; (r4 is used purely as a scratch register holding the filter flag).
222 ;-------------------------------------------------------------------------------------------
223 INIT_XMM sse4
224 cglobal intra_pred_dc16, 4, 7, 4
225 mov r4d, r5m ; r4d = filter flag
226 inc r2 ; r2 = left + 1
227 inc r3 ; r3 = above + 1
228 pxor m0, m0 ; m0 = 0: psadbw operand and pshufb "broadcast byte 0" mask
229 movu m1, [r2]
230 movu m2, [r3]
231 psadbw m1, m0
232 psadbw m2, m0
233 paddw m1, m2 ; per-64-bit-half partial sums of left + above
234 pshufd m2, m1, 2
235 paddw m1, m2 ; low word = sum of all 32 pixels
236
237 movd r5d, m1
238 add r5d, 16
239 shr r5d, 5 ; sum = sum / 32
240 movd m1, r5d
241 pshufb m1, m0 ; m1 = byte [dc_val ...]
242
243 test r4d, r4d ; ZF = (filter == 0); consumed by "jz .end" below, nothing in between modifies flags
244
245 ; store DC 16x16
246 mov r6, r0 ; r6 = dst, kept for the filter pass because r0 is advanced
247 movu [r0], m1
248 movu [r0 + r1], m1
249 lea r0, [r0 + r1 * 2]
250 movu [r0], m1
251 movu [r0 + r1], m1
252 lea r0, [r0 + r1 * 2]
253 movu [r0], m1
254 movu [r0 + r1], m1
255 lea r0, [r0 + r1 * 2]
256 movu [r0], m1
257 movu [r0 + r1], m1
258 lea r0, [r0 + r1 * 2]
259 movu [r0], m1
260 movu [r0 + r1], m1
261 lea r0, [r0 + r1 * 2]
262 movu [r0], m1
263 movu [r0 + r1], m1
264 lea r0, [r0 + r1 * 2]
265 movu [r0], m1
266 movu [r0 + r1], m1
267 lea r0, [r0 + r1 * 2]
268 movu [r0], m1
269 movu [r0 + r1], m1
270
271 ; Do DC Filter
272 jz .end ; filter == 0: the plain DC block is the final result
273 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
274 add r5d, r4d ; r5d = DC * 3 + 2
275 movd m1, r5d
276 pshuflw m1, m1, 0 ; m1 = pixDCx3
277 pshufd m1, m1, 0
278
279 ; filter top
; row 0: dst[x] = (above[x] + 3 * DC + 2) >> 2, done as two 8-pixel halves
280 pmovzxbw m2, [r3]
281 paddw m2, m1
282 psraw m2, 2
283 packuswb m2, m2
284 movh [r6], m2
285 pmovzxbw m3, [r3 + 8]
286 paddw m3, m1
287 psraw m3, 2
288 packuswb m3, m3
289 movh [r6 + 8], m3
290
291 ; filter top-left
; dst[0] = (first above pixel + first left pixel + 2 * DC + 2) >> 2
292 movzx r3d, byte [r3]
293 add r4d, r3d
294 movzx r3d, byte [r2]
295 add r3d, r4d
296 shr r3d, 2
297 mov [r6], r3b
298
299 ; filter left
; column 0 of rows 1..15: (left pixel + 3 * DC + 2) >> 2; rows 1..8 come from m2
300 add r6, r1 ; r6 = row 1
301 pmovzxbw m2, [r2 + 1]
302 paddw m2, m1
303 psraw m2, 2
304 packuswb m2, m2
305 pextrb [r6], m2, 0
306 pextrb [r6 + r1], m2, 1
307 pextrb [r6 + r1 * 2], m2, 2
308 lea r6, [r6 + r1 * 2]
309 pextrb [r6 + r1], m2, 3
310 pextrb [r6 + r1 * 2], m2, 4
311 lea r6, [r6 + r1 * 2]
312 pextrb [r6 + r1], m2, 5
313 pextrb [r6 + r1 * 2], m2, 6
314 lea r6, [r6 + r1 * 2]
315 pextrb [r6 + r1], m2, 7
316
; rows 9..15 come from m3
317 pmovzxbw m3, [r2 + 9]
318 paddw m3, m1
319 psraw m3, 2
320 packuswb m3, m3
321 pextrb [r6 + r1 * 2], m3, 0
322 lea r6, [r6 + r1 * 2]
323 pextrb [r6 + r1], m3, 1
324 pextrb [r6 + r1 * 2], m3, 2
325 lea r6, [r6 + r1 * 2]
326 pextrb [r6 + r1], m3, 3
327 pextrb [r6 + r1 * 2], m3, 4
328 lea r6, [r6 + r1 * 2]
329 pextrb [r6 + r1], m3, 5
330 pextrb [r6 + r1 * 2], m3, 6
331
332 .end:
333 RET
334
335 ;-------------------------------------------------------------------------------------------
336 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 32x32 DC intra prediction.
; Fills the 32x32 block at dst with (sum of 32 bytes at left + 1 and 32 bytes
; at above + 1, plus 32) >> 6. No edge filtering is performed here: neither
; dirMode nor filter is read.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above.
337 ;-------------------------------------------------------------------------------------------
338 INIT_XMM sse4
339 cglobal intra_pred_dc32, 4, 5, 5
340 inc r2 ; r2 = left + 1
341 inc r3 ; r3 = above + 1
342 pxor m0, m0 ; m0 = 0: psadbw operand and pshufb "broadcast byte 0" mask
343 movu m1, [r2]
344 movu m2, [r2 + 16]
345 movu m3, [r3]
346 movu m4, [r3 + 16]
347 psadbw m1, m0
348 psadbw m2, m0
349 psadbw m3, m0
350 psadbw m4, m0
351 paddw m1, m2
352 paddw m3, m4
353 paddw m1, m3 ; per-64-bit-half partial sums of all 64 pixels
354 pshufd m2, m1, 2
355 paddw m1, m2 ; low word = total sum
356
357 movd r4d, m1
358 add r4d, 32
359 shr r4d, 6 ; sum = sum / 64
360 movd m1, r4d
361 pshufb m1, m0 ; m1 = byte [dc_val ...]
362
363 %rep 2
364 ; store 16 rows of 32 pixels (two 16-byte stores per row); %rep 2 covers all 32 rows
365 movu [r0], m1
366 movu [r0 + r1], m1
367 movu [r0 + 16], m1
368 movu [r0 + r1 + 16],m1
369 lea r0, [r0 + 2 * r1]
370 movu [r0], m1
371 movu [r0 + r1], m1
372 movu [r0 + 16], m1
373 movu [r0 + r1 + 16],m1
374 lea r0, [r0 + 2 * r1]
375 movu [r0], m1
376 movu [r0 + r1], m1
377 movu [r0 + 16], m1
378 movu [r0 + r1 + 16],m1
379 lea r0, [r0 + 2 * r1]
380 movu [r0], m1
381 movu [r0 + r1], m1
382 movu [r0 + 16], m1
383 movu [r0 + r1 + 16],m1
384 lea r0, [r0 + 2 * r1]
385 movu [r0], m1
386 movu [r0 + r1], m1
387 movu [r0 + 16], m1
388 movu [r0 + r1 + 16],m1
389 lea r0, [r0 + 2 * r1]
390 movu [r0], m1
391 movu [r0 + r1], m1
392 movu [r0 + 16], m1
393 movu [r0 + r1 + 16],m1
394 lea r0, [r0 + 2 * r1]
395 movu [r0], m1
396 movu [r0 + r1], m1
397 movu [r0 + 16], m1
398 movu [r0 + r1 + 16],m1
399 lea r0, [r0 + 2 * r1]
400 movu [r0], m1
401 movu [r0 + r1], m1
402 movu [r0 + 16], m1
403 movu [r0 + r1 + 16],m1
404 lea r0, [r0 + 2 * r1]
405 %endrep
406
407 RET
408
409 ;-----------------------------------------------------------------------------------------------------------
410 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 4x4 planar intra prediction, two rows per macro expansion.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; dirMode and filter are not read.
; Register roles after setup:
;   m0 = vertical accumulator, low half for the even row, high half for the odd row
;   m2 = 2 * (bottomLeft - topRow), added to m0 once per row pair
;   r6d = topRight
411 ;-----------------------------------------------------------------------------------------------------------
412 INIT_XMM sse4
413 cglobal intra_pred_planar4, 4,7,5
414 inc r2 ; r2 = left + 1
415 inc r3 ; r3 = above + 1
416 pmovzxbw m0, [r3] ; topRow[i] = above[i];
417 punpcklqdq m0, m0 ; duplicate the 4 topRow words into both halves
418
419 pxor m1, m1
420 movd m2, [r2 + 4] ; bottomLeft = left[4]
421 movzx r6d, byte [r3 + 4] ; topRight = above[4];
422 pshufb m2, m1 ; broadcast bottomLeft to every byte
423 punpcklbw m2, m1 ; ... and widen to words
424 psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i]
425 psllw m0, 2 ; topRow * 4
426 punpcklqdq m3, m2, m1 ; m3 = bottomRow in the low half, 0 in the high half
427 psubw m0, m3 ; bias the low half by -bottomRow so one "+= 2 * bottomRow" per pair yields weights y + 1 and y + 2
428 paddw m2, m2 ; m2 = 2 * bottomRow
429
; COMP_PRED_PLANAR_2ROW y: predicts rows y and y + 1 and advances r0 by two rows.
; Reads the left pixels at [r2 + y] and [r2 + y + 1]; clobbers r4d, r5d, m1, m3, m4.
430 %macro COMP_PRED_PLANAR_2ROW 1
431 movzx r4d, byte [r2 + %1]
432 lea r4d, [r4d * 4 + 4] ; left * 4 + 4 (the + 4 rounds the final >> 3)
433 movd m3, r4d
434 pshuflw m3, m3, 0
435
436 movzx r4d, byte [r2 + %1 + 1]
437 lea r4d, [r4d * 4 + 4]
438 movd m4, r4d
439 pshuflw m4, m4, 0
440 punpcklqdq m3, m4 ; horPred
441
442 movzx r4d, byte [r2 + %1]
443 mov r5d, r6d
444 sub r5d, r4d ; topRight - left for the first row of the pair
445 movd m4, r5d
446 pshuflw m4, m4, 0
447
448 movzx r4d, byte [r2 + %1 + 1]
449 mov r5d, r6d
450 sub r5d, r4d ; topRight - left for the second row of the pair
451 movd m1, r5d
452 pshuflw m1, m1, 0
453 punpcklqdq m4, m1 ; rightColumnN
454
455 pmullw m4, [multi_2Row] ; per-column weights; NOTE(review): multi_2Row is external, presumably 1,2,3,4 in each half
456 paddw m3, m4
457 paddw m0, m2 ; step the vertical accumulator by two rows
458 paddw m3, m0
459 psraw m3, 3
460 packuswb m3, m3
461
462 movd [r0], m3 ; first row of the pair
463 pshufd m3, m3, 0x55
464 movd [r0 + r1], m3 ; second row of the pair
465 lea r0, [r0 + 2 * r1]
466 %endmacro
467
468 COMP_PRED_PLANAR_2ROW 0
469 COMP_PRED_PLANAR_2ROW 2
470
471 RET
472
473 ;-----------------------------------------------------------------------------------------------------------
474 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 8x8 planar intra prediction, one row per macro expansion.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; dirMode and filter are not read.
; Register roles after setup:
;   m1 = topRow * 8, incremented by m4 (bottomLeft - topRow) once per row
;   m3 = topRight - leftColumn (one word per row)
;   m6 = leftColumn * 8 + [pw_8] (one word per row)
475 ;-----------------------------------------------------------------------------------------------------------
476 INIT_XMM sse4
477 cglobal intra_pred_planar8, 4,4,7
478 inc r2 ; r2 = left + 1
479 inc r3 ; r3 = above + 1
480 pxor m0, m0
481 pmovzxbw m1, [r3] ; v_topRow
482 pmovzxbw m2, [r2] ; v_leftColumn
483
484 movd m3, [r3 + 8] ; topRight = above[8];
485 movd m4, [r2 + 8] ; bottomLeft = left[8];
486
487 pshufb m3, m0 ; broadcast byte 0 (m0 is all-zero)
488 pshufb m4, m0
489 punpcklbw m3, m0 ; v_topRight
490 punpcklbw m4, m0 ; v_bottomLeft
491
492 psubw m4, m1 ; v_bottomRow
493 psubw m3, m2 ; v_rightColumn
494
495 psllw m1, 3 ; v_topRow
496 psllw m2, 3 ; v_leftColumn
497
498 paddw m6, m2, [pw_8] ; NOTE(review): pw_8 is external, presumably 8 in every word (rounding for the final >> 4)
499
; PRED_PLANAR_ROW8 y: predicts row y (0..7), stores 8 pixels and advances r0 by one row.
; Broadcasts word y of m6 (horPred) and of m3 (rightColumn); clobbers m2, m5.
500 %macro PRED_PLANAR_ROW8 1
501 %if (%1 < 4)
502 pshuflw m5, m6, 0x55 * %1
503 pshufd m5, m5, 0
504 pshuflw m2, m3, 0x55 * %1
505 pshufd m2, m2, 0
506 %else
507 pshufhw m5, m6, 0x55 * (%1 - 4)
508 pshufd m5, m5, 0xAA
509 pshufhw m2, m3, 0x55 * (%1 - 4)
510 pshufd m2, m2, 0xAA
511 %endif
512
513 pmullw m2, [multiL] ; per-column weights; NOTE(review): multiL is external, presumably words 1..8
514 paddw m5, m2
515 paddw m1, m4 ; step the vertical accumulator by one row
516 paddw m5, m1
517 psraw m5, 4
518 packuswb m5, m5
519
520 movh [r0], m5
521 lea r0, [r0 + r1]
522
523 %endmacro
524
525 PRED_PLANAR_ROW8 0
526 PRED_PLANAR_ROW8 1
527 PRED_PLANAR_ROW8 2
528 PRED_PLANAR_ROW8 3
529 PRED_PLANAR_ROW8 4
530 PRED_PLANAR_ROW8 5
531 PRED_PLANAR_ROW8 6
532 PRED_PLANAR_ROW8 7
533
534 RET
535
536
537 ;-----------------------------------------------------------------------------------------------------------
538 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 16x16 planar intra prediction, one row per macro expansion.
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; dirMode and filter are not read.
; Register roles after setup:
;   m1 / m2 = topRow * 16 for columns 0-7 / 8-15, incremented per row by m4 / m5
;   m4 / m5 = bottomLeft - topRow for columns 0-7 / 8-15
;   r4d = topRight (r3 is reused as scratch inside the macro)
539 ;-----------------------------------------------------------------------------------------------------------
540 INIT_XMM sse4
541 cglobal intra_pred_planar16, 4,6,8
542 inc r2 ; r2 = left + 1
543 inc r3 ; r3 = above + 1
544 pxor m0, m0
545 pmovzxbw m1, [r3] ; topRow[0-7]
546 pmovzxbw m2, [r3 + 8] ; topRow[8-15]
547
548 movd m3, [r2 + 16]
549 pshufb m3, m0
550 punpcklbw m3, m0 ; v_bottomLeft = left[16]
551 movzx r4d, byte [r3 + 16] ; topRight = above[16]
552
553 psubw m4, m3, m1 ; v_bottomRow[0]
554 psubw m5, m3, m2 ; v_bottomRow[1]
555
556 psllw m1, 4
557 psllw m2, 4
558
; PRED_PLANAR_ROW16 y: predicts row y (0..15), stores 16 pixels and advances r0 by one row.
; Reads the left pixel at [r2 + y]; clobbers r3d, r5d, m3, m6, m7.
559 %macro PRED_PLANAR_ROW16 1
560 movzx r5d, byte [r2 + %1]
561 add r5d, r5d
562 lea r5d, [r5d * 8 + 16] ; left * 16 + 16 (the + 16 rounds the final >> 5)
563 movd m3, r5d
564 pshuflw m3, m3, 0
565 pshufd m3, m3, 0 ; horPred
566
567 movzx r5d, byte [r2 + %1]
568 mov r3d, r4d
569 sub r3d, r5d ; topRight - left
570 movd m6, r3d
571 pshuflw m6, m6, 0
572 pshufd m6, m6, 0
573
574 pmullw m7, m6, [multiL] ; columns 0-7; NOTE(review): multiL / multiH are external weight tables
575 paddw m7, m3
576 paddw m1, m4 ; step the vertical accumulator (columns 0-7)
577 paddw m7, m1
578 psraw m7, 5
579
580 pmullw m6, m6, [multiH] ; columns 8-15
581 paddw m3, m6
582 paddw m2, m5 ; step the vertical accumulator (columns 8-15)
583 paddw m3, m2
584 psraw m3, 5
585
586 packuswb m7, m3
587 movu [r0], m7
588 lea r0, [r0 + r1]
589 %endmacro
590
591 PRED_PLANAR_ROW16 0
592 PRED_PLANAR_ROW16 1
593 PRED_PLANAR_ROW16 2
594 PRED_PLANAR_ROW16 3
595 PRED_PLANAR_ROW16 4
596 PRED_PLANAR_ROW16 5
597 PRED_PLANAR_ROW16 6
598 PRED_PLANAR_ROW16 7
599 PRED_PLANAR_ROW16 8
600 PRED_PLANAR_ROW16 9
601 PRED_PLANAR_ROW16 10
602 PRED_PLANAR_ROW16 11
603 PRED_PLANAR_ROW16 12
604 PRED_PLANAR_ROW16 13
605 PRED_PLANAR_ROW16 14
606 PRED_PLANAR_ROW16 15
607
608 RET
609
610
611 ;-----------------------------------------------------------------------------------------------------------
612 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
; 32x32 planar intra prediction, one row per loop iteration (two 16-pixel halves).
; r0 = dst, r1 = dstStride, r2 = left, r3 = above; dirMode and filter are not read.
; The four (bottomLeft - topRow) vectors live in m8-m11 on x86-64 and in four
; stack slots otherwise (selected by the bottomRowN defines below).
; Register roles after setup:
;   m1..m4 = topRow * 32 for columns 0-7 / 8-15 / 16-23 / 24-31, stepped once per row
;   r4d = topRight; r2 walks the left column; r3 is reused as the row counter
613 ;-----------------------------------------------------------------------------------------------------------
614 INIT_XMM sse4
615 %if ARCH_X86_64 == 1
616 cglobal intra_pred_planar32, 4,7,12
617 %define bottomRow0 m8
618 %define bottomRow1 m9
619 %define bottomRow2 m10
620 %define bottomRow3 m11
621 %else
622 cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize)
623 %define bottomRow0 [rsp + 0 * mmsize]
624 %define bottomRow1 [rsp + 1 * mmsize]
625 %define bottomRow2 [rsp + 2 * mmsize]
626 %define bottomRow3 [rsp + 3 * mmsize]
627 %endif
628 inc r2 ; r2 = left + 1
629 inc r3 ; r3 = above + 1
630 pxor m3, m3 ; temporary zero; m3 is reloaded with topRow[2] below
631 movd m0, [r2 + 32]
632 pshufb m0, m3
633 punpcklbw m0, m3 ; v_bottomLeft = left[32]
634 movzx r4d, byte [r3 + 32] ; topRight = above[32]
635
636 pmovzxbw m1, [r3 + 0] ; topRow[0]
637 pmovzxbw m2, [r3 + 8] ; topRow[1]
638 pmovzxbw m3, [r3 +16] ; topRow[2]
639 pmovzxbw m4, [r3 +24] ; topRow[3]
640
641 psubw m5, m0, m1 ; v_bottomRow[0]
642 psubw m6, m0, m2 ; v_bottomRow[1]
643 psubw m7, m0, m3 ; v_bottomRow[2]
644 psubw m0, m4 ; v_bottomRow[3]
645
646 mova bottomRow0, m5
647 mova bottomRow1, m6
648 mova bottomRow2, m7
649 mova bottomRow3, m0
650
651 psllw m1, 5
652 psllw m2, 5
653 psllw m3, 5
654 psllw m4, 5
655
; COMP_PRED_PLANAR_ROW off: predicts 16 pixels of the current row and stores them at [r0 + off].
; off == 0 handles columns 0-15 (m1, m2, multiL, multiH); any other value (16 is used) handles
; columns 16-31 (m3, m4, multiH2, multiH3). Reads the left pixel at [r2]; clobbers r5d, r6d, m5-m7.
; NOTE(review): the multi* weight tables are external to this file.
656 %macro COMP_PRED_PLANAR_ROW 1
657 movzx r5d, byte [r2]
658 shl r5d, 5
659 add r5d, 32 ; left * 32 + 32 (the + 32 rounds the final >> 6)
660 movd m5, r5d
661 pshuflw m5, m5, 0
662 pshufd m5, m5, 0 ; horPred
663
664 movzx r5d, byte [r2]
665 mov r6d, r4d
666 sub r6d, r5d ; topRight - left
667 movd m6, r6d
668 pshuflw m6, m6, 0
669 pshufd m6, m6, 0
670
671 %if (%1 == 0)
672 pmullw m7, m6, [multiL]
673 %else
674 pmullw m7, m6, [multiH2]
675 %endif
676
677 paddw m7, m5
678 %if (%1 == 0)
679 paddw m1, bottomRow0
680 paddw m7, m1
681 %else
682 paddw m3, bottomRow2
683 paddw m7, m3
684 %endif
685 psraw m7, 6
686
687 %if (%1 == 0)
688 pmullw m6, [multiH]
689 %else
690 pmullw m6, [multiH3]
691 %endif
692 paddw m6, m5
693 %if (%1 == 0)
694 paddw m2, bottomRow1
695 paddw m6, m2
696 %else
697 paddw m4, bottomRow3
698 paddw m6, m4
699 %endif
700 psraw m6, 6
701
702 packuswb m7, m6
703 movu [r0 + %1], m7
704 %endmacro
705
706 mov r3, 32 ; row counter (the above pointer is no longer needed)
707 .loop:
708 COMP_PRED_PLANAR_ROW 0
709 COMP_PRED_PLANAR_ROW 16
710 inc r2 ; next left pixel
711 lea r0, [r0 + r1] ; next destination row
712
713 dec r3
714 jnz .loop
715 %undef COMP_PRED_PLANAR_ROW
716
717 RET
718
719 ;-----------------------------------------------------------------------------
720 ; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
; 4x4 angular prediction for mode 2, also used for mode 34: when dirMode == 34
; the reference pointer is switched from refLeft to refAbove.
; Row y of the output is the 4 reference bytes starting at ref + 2 + y.
; r0 = dst, r1 = dstStride, r2 = reference in use.
721 ;-----------------------------------------------------------------------------
722 INIT_XMM ssse3
723 cglobal intra_pred_ang4_2, 3,3,4
724 cmp r4m, byte 34
725 cmove r2, r3mp ; dirMode == 34: use refAbove
726 movh m0, [r2 + 2] ; m0 = ref[2..9]
727 movd [r0], m0 ; row 0
728 palignr m1, m0, 1 ; low bytes = m0 shifted right by 1 byte
729 movd [r0 + r1], m1 ; row 1
730 palignr m2, m0, 2 ; low bytes = m0 shifted right by 2 bytes
731 movd [r0 + r1 * 2], m2 ; row 2
732 lea r1, [r1 * 3]
733 psrldq m0, 3
734 movd [r0 + r1], m0 ; row 3 (r1 now holds 3 * stride)
735 RET
736
737
; 4x4 angular prediction for mode 3, also used for mode 33: when dirMode == 33
; the reference is refAbove and the result is stored without transposing.
; This function also hosts the shared tail .do_filter4x4 that the other
; intra_pred_ang4_N entry points jump into.
; r0 = dst, r1 = dstStride, r2 = reference in use, r3 = base pointer into ang_table.
738 INIT_XMM sse4
739 cglobal intra_pred_ang4_3, 3,4,5
740 cmp r4m, byte 33 ; ZF is kept alive until "jz .store" in the shared tail
741 cmove r2, r3mp ; dirMode == 33: use refAbove
742 lea r3, [ang_table + 20 * 16]
743 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
744 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
745 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
746 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
747 palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
748 palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
749 punpcklqdq m0, m1 ; m0 = pixel pairs for rows 0 (low half) and 1 (high half)
750 punpcklqdq m2, m3 ; m2 = pixel pairs for rows 2 (low half) and 3 (high half)
751
752 movh m3, [r3 + 6 * 16] ; [26]
753 movhps m3, [r3] ; [20]
754 movh m4, [r3 - 6 * 16] ; [14]
755 movhps m4, [r3 - 12 * 16] ; [ 8]
756 jmp .do_filter4x4
757
758 ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
759 ALIGN 16
760 .do_filter4x4:
761 mova m1, [pw_1024] ; NOTE(review): pw_1024 is external, presumably words of 1024 so pmulhrsw computes (x + 16) >> 5
762
763 pmaddubsw m0, m3 ; weighted sum of each pixel pair, rows 0-1
764 pmulhrsw m0, m1
765 pmaddubsw m2, m4 ; weighted sum of each pixel pair, rows 2-3
766 pmulhrsw m2, m1
767 packuswb m0, m2 ; m0 = 16 result bytes, 4 per computed row
768
769 ; NOTE: when ZF is set (dirMode matched the entry point's "cmp r4m", e.g. mode 33) the rows are
; stored without reordering. UNSAFE: this relies on no instruction between that cmp and this jz
; modifying the EFLAGS register.
770 jz .store
771
772 ; transpose 4x4
773 pshufb m0, [c_trans_4x4]
774
775 .store:
776 ; NOTE: rows 1-3 are stored with pextrd (this function is assembled under INIT_XMM sse4)
777 movd [r0], m0
778 pextrd [r0 + r1], m0, 1
779 pextrd [r0 + r1 * 2], m0, 2
780 lea r1, [r1 * 3]
781 pextrd [r0 + r1], m0, 3
782 RET
783
784
; 4x4 angular prediction for mode 4, also used for mode 32 (dirMode == 32 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; Prepares m0 / m2 = interleaved pixel pairs for rows 0-1 / 2-3 and m3 / m4 =
; ang_table weights, then jumps to .do_filter4x4 in intra_pred_ang4_3.
785 cglobal intra_pred_ang4_4, 3,4,5
786 cmp r4m, byte 32 ; ZF must survive until the shared tail's "jz .store"
787 cmove r2, r3mp
788 lea r3, [ang_table + 18 * 16]
789 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
790 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
791 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
792 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
793 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
794 punpcklqdq m0, m1 ; rows 0 and 1
795 punpcklqdq m2, m1, m3 ; rows 2 and 3
796
797 movh m3, [r3 + 3 * 16] ; [21]
798 movhps m3, [r3 - 8 * 16] ; [10]
799 movh m4, [r3 + 13 * 16] ; [31]
800 movhps m4, [r3 + 2 * 16] ; [20]
801 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
802
803
; 4x4 angular prediction for mode 5, also used for mode 31 (dirMode == 31 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; Prepares m0 / m2 = interleaved pixel pairs for rows 0-1 / 2-3 and m3 / m4 =
; ang_table weights, then jumps to .do_filter4x4 in intra_pred_ang4_3.
804 cglobal intra_pred_ang4_5, 3,4,5
805 cmp r4m, byte 31 ; ZF must survive until the shared tail's "jz .store"
806 cmove r2, r3mp
807 lea r3, [ang_table + 10 * 16]
808 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
809 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
810 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
811 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
812 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
813 punpcklqdq m0, m1 ; rows 0 and 1
814 punpcklqdq m2, m1, m3 ; rows 2 and 3
815
816 movh m3, [r3 + 7 * 16] ; [17]
817 movhps m3, [r3 - 8 * 16] ; [ 2]
818 movh m4, [r3 + 9 * 16] ; [19]
819 movhps m4, [r3 - 6 * 16] ; [ 4]
820 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
821
822
; 4x4 angular prediction for mode 6, also used for mode 30 (dirMode == 30 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; Rows 0-1 share one set of pixel pairs (m0) and rows 2-3 share the next (m2);
; m3 / m4 hold the ang_table weights. Ends by jumping to .do_filter4x4 in
; intra_pred_ang4_3.
823 cglobal intra_pred_ang4_6, 3,4,5
824 cmp r4m, byte 30 ; ZF must survive until the shared tail's "jz .store"
825 cmove r2, r3mp
826 lea r3, [ang_table + 19 * 16]
827 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
828 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
829 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
830 palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
831 punpcklqdq m0, m0 ; same pairs for rows 0 and 1
832 punpcklqdq m2, m2 ; same pairs for rows 2 and 3
833
834 movh m3, [r3 - 6 * 16] ; [13]
835 movhps m3, [r3 + 7 * 16] ; [26]
836 movh m4, [r3 - 12 * 16] ; [ 7]
837 movhps m4, [r3 + 1 * 16] ; [20]
838 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
839
840
; 4x4 angular prediction for mode 7, also used for mode 29 (dirMode == 29 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; Rows 0-2 use the same pixel pairs and row 3 uses the pairs shifted by one
; pixel; m3 / m4 hold the ang_table weights. Ends by jumping to .do_filter4x4
; in intra_pred_ang4_3.
841 cglobal intra_pred_ang4_7, 3,4,5
842 cmp r4m, byte 29 ; ZF must survive until the shared tail's "jz .store"
843 cmove r2, r3mp
844 lea r3, [ang_table + 20 * 16]
845 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
846 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
847 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
848 palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
849 punpcklqdq m2, m0, m3 ; rows 2 and 3
850 punpcklqdq m0, m0 ; same pairs for rows 0 and 1
851
852 movh m3, [r3 - 11 * 16] ; [ 9]
853 movhps m3, [r3 - 2 * 16] ; [18]
854 movh m4, [r3 + 7 * 16] ; [27]
855 movhps m4, [r3 - 16 * 16] ; [ 4]
856 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
857
858
; 4x4 angular prediction for mode 8, also used for mode 28 (dirMode == 28 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; All four rows use the same pixel pairs (m0 == m2); only the ang_table weights
; in m3 / m4 differ per row. Ends by jumping to .do_filter4x4 in intra_pred_ang4_3.
859 cglobal intra_pred_ang4_8, 3,4,5
860 cmp r4m, byte 28 ; ZF must survive until the shared tail's "jz .store"
861 cmove r2, r3mp
862 lea r3, [ang_table + 13 * 16]
863 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
864 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
865 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
866 punpcklqdq m0, m0
867 mova m2, m0
868
869 movh m3, [r3 - 8 * 16] ; [ 5]
870 movhps m3, [r3 - 3 * 16] ; [10]
871 movh m4, [r3 + 2 * 16] ; [15]
872 movhps m4, [r3 + 7 * 16] ; [20]
873 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
874
875
; 4x4 angular prediction for mode 9, also used for mode 27 (dirMode == 27 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; All four rows use the same pixel pairs (m0 == m2); only the ang_table weights
; in m3 / m4 differ per row. Ends by jumping to .do_filter4x4 in intra_pred_ang4_3.
876 cglobal intra_pred_ang4_9, 3,4,5
877 cmp r4m, byte 27 ; ZF must survive until the shared tail's "jz .store"
878 cmove r2, r3mp
879 lea r3, [ang_table + 4 * 16]
880 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
881 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
882 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
883 punpcklqdq m0, m0
884 mova m2, m0
885
886 movh m3, [r3 - 2 * 16] ; [ 2]
887 movhps m3, [r3 - 0 * 16] ; [ 4]
888 movh m4, [r3 + 2 * 16] ; [ 6]
889 movhps m4, [r3 + 4 * 16] ; [ 8]
890 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
891
892
; 4x4 angular prediction for mode 10: row y is refLeft[1 + y] replicated 4 times.
; When bFilter != 0, row 0 is replaced by
;   clip8(refLeft[1] + ((refAbove[1 + x] - refAbove[0]) >> 1)),  x = 0..3.
; r0 = dst, r1 = dstStride, r2 = refLeft; refAbove is read from r3mp, bFilter from r5m.
893 cglobal intra_pred_ang4_10, 3,3,4
894 movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
895 pshufb m0, [pb_unpackbd1] ; NOTE(review): pb_unpackbd1 is external; judging by the stores below it replicates each of the 4 bytes across one dword
896
897 pshufd m1, m0, 1
898 movhlps m2, m0
899 pshufd m3, m0, 3
900 movd [r0 + r1], m1 ; row 1
901 movd [r0 + r1 * 2], m2 ; row 2
902 lea r1, [r1 * 3]
903 movd [r0 + r1], m3 ; row 3 (r1 now holds 3 * stride)
904
905 cmp r5m, byte 0
906 jz .quit ; no filter: row 0 is the plain replicated pixel still in m0
907
908 ; filter
909 mov r2, r3mp ; r2 = refAbove
910 pmovzxbw m0, m0 ; [-1 -1 -1 -1]
911 movh m1, [r2] ; [4 3 2 1 0]
912 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
913 pshufb m1, [pb_unpackbw1] ; [4 3 2 1]
914 psubw m1, m2 ; refAbove[1 + x] - refAbove[0]
915 psraw m1, 1
916 paddw m0, m1
917 packuswb m0, m0 ; saturate to 0..255
918
919 .quit:
920 movd [r0], m0 ; row 0
921 RET
922
923
; 4x4 angular prediction for mode 26: every row is refAbove[1..4].
; When bFilter != 0, column 0 is replaced by
;   clip8(refAbove[1] + ((refLeft[1 + y] - refLeft[0]) >> 1)),  y = 0..3.
; r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (later reused as 3 * stride);
; bFilter is read from r5m.
924 INIT_XMM sse4
925 cglobal intra_pred_ang4_26, 4,4,3
926 movd m0, [r3 + 1] ; [8 7 6 5 4 3 2 1]
927
928 ; store
929 movd [r0], m0
930 movd [r0 + r1], m0
931 movd [r0 + r1 * 2], m0
932 lea r3, [r1 * 3] ; r3 = 3 * stride (the refAbove pointer is no longer needed)
933 movd [r0 + r3], m0
934
935 ; filter
936 cmp r5m, byte 0
937 jz .quit
938
939 pshufb m0, [pb_0_8] ; [ 1 1 1 1]
940 movh m1, [r2] ; [-4 -3 -2 -1 0]
941 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
942 pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
943 psubw m1, m2 ; refLeft[1 + y] - refLeft[0]
944 psraw m1, 1
945 paddw m0, m1
946 packuswb m0, m0 ; saturate to 0..255
947
; overwrite column 0 of each row with the filtered value
948 pextrb [r0], m0, 0
949 pextrb [r0 + r1], m0, 1
950 pextrb [r0 + r1 * 2], m0, 2
951 pextrb [r0 + r3], m0, 3
952
953 .quit:
954 RET
955
956
; 4x4 angular prediction for mode 11, also used for mode 25 (dirMode == 25 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; All four rows use the same pixel pairs starting at ref[0] (m0 == m2); only the
; ang_table weights in m3 / m4 differ per row. Ends by jumping to .do_filter4x4
; in intra_pred_ang4_3.
957 cglobal intra_pred_ang4_11, 3,4,5
958 cmp r4m, byte 25 ; ZF must survive until the shared tail's "jz .store"
959 cmove r2, r3mp
960 lea r3, [ang_table + 24 * 16]
961 movh m0, [r2] ; [x x x 4 3 2 1 0]
962 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
963 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
964 punpcklqdq m0, m0
965 mova m2, m0
966
967 movh m3, [r3 + 6 * 16] ; [30]
968 movhps m3, [r3 + 4 * 16] ; [28]
969 movh m4, [r3 + 2 * 16] ; [26]
970 movhps m4, [r3 + 0 * 16] ; [24]
971 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
972
973
; 4x4 angular prediction for mode 12, also used for mode 24 (dirMode == 24 selects
; refAbove and, via ZF, the untransposed store in the shared tail).
; All four rows use the same pixel pairs starting at ref[0] (m0 == m2); only the
; ang_table weights in m3 / m4 differ per row. Ends by jumping to .do_filter4x4
; in intra_pred_ang4_3.
974 cglobal intra_pred_ang4_12, 3,4,5
975 cmp r4m, byte 24 ; ZF must survive until the shared tail's "jz .store"
976 cmove r2, r3mp
977 lea r3, [ang_table + 20 * 16]
978 movh m0, [r2] ; [x x x 4 3 2 1 0]
979 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
980 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
981 punpcklqdq m0, m0
982 mova m2, m0
983
984 movh m3, [r3 + 7 * 16] ; [27]
985 movhps m3, [r3 + 2 * 16] ; [22]
986 movh m4, [r3 - 3 * 16] ; [17]
987 movhps m4, [r3 - 8 * 16] ; [12]
988 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
989
990
; 4x4 angular prediction for mode 13, also used for mode 23: when dirMode == 23
; the two reference pointers are swapped, so r2 is always the main reference
; and r3 the side reference. One side-reference pixel ([r3 + 4]) is spliced in
; front of the main reference. Ends by jumping to .do_filter4x4 in
; intra_pred_ang4_3 with ZF still reflecting the cmp below.
991 cglobal intra_pred_ang4_13, 4,4,5
992 cmp r4m, byte 23
993 jnz .load
994 xchg r2, r3 ; xchg leaves EFLAGS untouched, so ZF is still valid for the shared tail
995 .load:
996 movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x]
997 palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
998 palignr m2, m1, 2 ; [x x x x 4 3 2 1]
999 pinsrb m1, [r3 + 4], 0 ; byte 0 = projected pixel from the side reference
1000 punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
1001 punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
1002 punpcklqdq m2, m0, m1 ; rows 2 and 3
1003 punpcklqdq m0, m0 ; same pairs for rows 0 and 1
1004
1005 lea r3, [ang_table + 21 * 16]
1006 movh m3, [r3 + 2 * 16] ; [23]
1007 movhps m3, [r3 - 7 * 16] ; [14]
1008 movh m4, [r3 - 16 * 16] ; [ 5]
1009 movhps m4, [r3 + 7 * 16] ; [28]
1010 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1011
1012
; 4x4 angular prediction for mode 14, also used for mode 22: when dirMode == 22
; the two reference pointers are swapped, so r2 is always the main reference
; and r3 the side reference. One side-reference pixel ([r3 + 2]) is spliced in
; front of the main reference. Ends by jumping to .do_filter4x4 in
; intra_pred_ang4_3 with ZF still reflecting the cmp below.
1013 cglobal intra_pred_ang4_14, 4,4,5
1014 cmp r4m, byte 22
1015 jnz .load
1016 xchg r2, r3 ; xchg leaves EFLAGS untouched, so ZF is still valid for the shared tail
1017 .load:
1018 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1019 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1020 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1021 pinsrb m2, [r3 + 2], 0 ; byte 0 = projected pixel from the side reference
1022 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1023 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1024 punpcklqdq m0, m0 ; same pairs for rows 0 and 1
1025 punpcklqdq m2, m2 ; same pairs for rows 2 and 3
1026
1027 lea r3, [ang_table + 19 * 16]
1028 movh m3, [r3 + 0 * 16] ; [19]
1029 movhps m3, [r3 - 13 * 16] ; [ 6]
1030 movh m4, [r3 + 6 * 16] ; [25]
1031 movhps m4, [r3 - 7 * 16] ; [12]
1032 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1033
1034
; 4x4 angular prediction for mode 15, also used for mode 21: when dirMode == 21
; the two reference pointers are swapped, so r2 is always the main reference
; and r3 the side reference. Two side-reference pixels ([r3 + 2], [r3 + 4]) are
; spliced in front of the main reference. Ends by jumping to .do_filter4x4 in
; intra_pred_ang4_3 with ZF still reflecting the cmp below.
1035 cglobal intra_pred_ang4_15, 4,4,5
1036 cmp r4m, byte 21
1037 jnz .load
1038 xchg r2, r3 ; xchg leaves EFLAGS untouched, so ZF is still valid for the shared tail
1039 .load:
1040 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1041 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1042 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1043 pinsrb m2, [r3 + 2], 0 ; byte 0 = first projected side pixel ("x")
1044 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
1045 pinsrb m3, [r3 + 4], 0 ; byte 0 = second projected side pixel ("y")
1046 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
1047 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1048 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1049 punpcklqdq m0, m2 ; rows 0 and 1
1050 punpcklqdq m2, m4 ; rows 2 and 3
1051
1052 lea r3, [ang_table + 23 * 16]
1053 movh m3, [r3 - 8 * 16] ; [15]
1054 movhps m3, [r3 + 7 * 16] ; [30]
1055 movh m4, [r3 - 10 * 16] ; [13]
1056 movhps m4, [r3 + 5 * 16] ; [28]
1057 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1058
1059
; 4x4 angular prediction for mode 16, also used for mode 20: when dirMode == 20
; the two reference pointers are swapped, so r2 is always the main reference
; and r3 the side reference. Two side-reference pixels ([r3 + 2], [r3 + 3]) are
; spliced in front of the main reference. Ends by jumping to .do_filter4x4 in
; intra_pred_ang4_3 with ZF still reflecting the cmp below.
1060 cglobal intra_pred_ang4_16, 4,4,5
1061 cmp r4m, byte 20
1062 jnz .load
1063 xchg r2, r3 ; xchg leaves EFLAGS untouched, so ZF is still valid for the shared tail
1064 .load:
1065 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1066 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1067 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1068 pinsrb m2, [r3 + 2], 0 ; byte 0 = first projected side pixel ("x")
1069 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
1070 pinsrb m3, [r3 + 3], 0 ; byte 0 = second projected side pixel ("y")
1071 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
1072 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1073 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1074 punpcklqdq m0, m2 ; rows 0 and 1
1075 punpcklqdq m2, m4 ; rows 2 and 3
1076
1077 lea r3, [ang_table + 19 * 16]
1078 movh m3, [r3 - 8 * 16] ; [11]
1079 movhps m3, [r3 + 3 * 16] ; [22]
1080 movh m4, [r3 - 18 * 16] ; [ 1]
1081 movhps m4, [r3 - 7 * 16] ; [12]
1082 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1083
1084
; 4x4 angular prediction for mode 17, also used for mode 19: when dirMode == 19
; the two reference pointers are swapped, so r2 is always the main reference
; and r3 the side reference. Three side-reference pixels ([r3 + 1], [r3 + 2],
; [r3 + 4]) are spliced in front of the main reference, one more per row.
; Ends by jumping to .do_filter4x4 in intra_pred_ang4_3 with ZF still
; reflecting the cmp below.
1085 cglobal intra_pred_ang4_17, 4,4,5
1086 cmp r4m, byte 19
1087 jnz .load
1088 xchg r2, r3 ; xchg leaves EFLAGS untouched, so ZF is still valid for the shared tail
1089 .load:
1090 movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x]
1091 palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
1092 palignr m1, m3, 2 ; [- - - - 4 3 2 1]
1093 mova m4, m0
1094 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1095
1096 pinsrb m3, [r3 + 1], 0 ; byte 0 = first projected side pixel ("x")
1097 punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
1098 punpcklqdq m0, m1 ; rows 0 and 1
1099
1100 pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
1101 pinsrb m2, [r3 + 2], 0 ; byte 0 = second projected side pixel ("y")
1102 pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
1103 pinsrb m1, [r3 + 4], 0 ; byte 0 = third projected side pixel ("z")
1104 punpcklbw m1, m2 ; [1 0 0 x x y y z]
1105 punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
1106 punpcklqdq m2, m1 ; rows 2 and 3
1107
1108 lea r3, [ang_table + 14 * 16]
1109 movh m3, [r3 - 8 * 16] ; [ 6]
1110 movhps m3, [r3 - 2 * 16] ; [12]
1111 movh m4, [r3 + 4 * 16] ; [18]
1112 movhps m4, [r3 + 10 * 16] ; [24]
1113 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1114
1115
; 4x4 angular prediction for mode 18 (pure copy, no interpolation).
; Builds the 8-byte sequence
;   refLeft[3] refLeft[2] refLeft[1] refLeft[0] refAbove[1] refAbove[2] refAbove[3] refAbove[4]
; and stores, for row y, the 4 bytes starting at offset 3 - y in it.
; r0 = dst, r1 = dstStride, r2 = refLeft (then scratch / 3 * stride), r3 = refAbove.
1116 cglobal intra_pred_ang4_18, 4,4,1
1117 mov r2d, [r2] ; refLeft[0..3]
1118 bswap r2d ; reverse to refLeft[3..0]
1119 movd m0, r2d
1120 pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
1121 lea r2, [r1 * 3]
1122 movd [r0 + r2], m0 ; row 3
1123 psrldq m0, 1
1124 movd [r0 + r1 * 2], m0 ; row 2
1125 psrldq m0, 1
1126 movd [r0 + r1], m0 ; row 1
1127 psrldq m0, 1
1128 movd [r0], m0 ; row 0
1129 RET
1130 ;-----------------------------------------------------------------------------
1131 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
; 8x8 angular prediction for mode 2, also used for mode 34: when dirMode == 34
; the reference pointer is switched from refLeft to refAbove.
; Row y of the output is the 8 reference bytes starting at ref + 2 + y.
; r0 = dst, r1 = dstStride, r2 = reference in use, r4 = 3 * stride.
1132 ;-----------------------------------------------------------------------------
1133 INIT_XMM ssse3
1134 cglobal intra_pred_ang8_2, 3,5,2
1135 cmp r4m, byte 34
1136 cmove r2, r3mp ; dirMode == 34: use refAbove
1137 movu m0, [r2 + 2] ; m0 = ref[2..17]
1138 lea r4, [r1 * 3]
1139
1140 movh [r0], m0 ; row 0
1141 palignr m1, m0, 1
1142 movh [r0 + r1], m1 ; row 1
1143 palignr m1, m0, 2
1144 movh [r0 + r1 * 2], m1 ; row 2
1145 palignr m1, m0, 3
1146 movh [r0 + r4], m1 ; row 3
1147 palignr m1, m0, 4
1148 lea r0, [r0 + r1 * 4]
1149 movh [r0], m1 ; row 4
1150 palignr m1, m0, 5
1151 movh [r0 + r1], m1 ; row 5
1152 palignr m1, m0, 6
1153 movh [r0 + r1 * 2], m1 ; row 6
1154 palignr m1, m0, 7
1155 movh [r0 + r4], m1 ; row 7
1156 RET
1157
; 8x8 angular prediction for mode 3, also used for mode 33: when dirMode == 33
; the reference is refAbove and the rows are stored without transposing.
; This function also hosts the shared tail .transpose8x8, which expects the
; eight computed rows packed as m4 = rows 0-1, m5 = rows 2-3, m6 = rows 4-5,
; m1 = rows 6-7, and ZF set when no transpose is wanted.
; r0 = dst, r1 = dstStride, r2 = reference in use, r3 / r4 = base pointers into ang_table.
1158 INIT_XMM sse4
1159 cglobal intra_pred_ang8_3, 3,5,8
1160 cmp r4m, byte 33 ; ZF is kept alive until "jz .store" in the shared tail
1161 cmove r2, r3mp ; dirMode == 33: use refAbove
1162 lea r3, [ang_table + 22 * 16]
1163 lea r4, [ang_table + 8 * 16]
1164 mova m3, [pw_1024] ; NOTE(review): pw_1024 is external, presumably words of 1024 so pmulhrsw computes (x + 16) >> 5
1165
1166 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1167 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1168
1169 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1170 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1171 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1172
; rows 0 and 1 -> m4
1173 pmaddubsw m4, m0, [r3 + 4 * 16] ; [26]
1174 pmulhrsw m4, m3
1175 pmaddubsw m1, [r3 - 2 * 16] ; [20]
1176 pmulhrsw m1, m3
1177 packuswb m4, m1
1178
1179 palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1180
; rows 2 and 3 -> m5
1181 pmaddubsw m5, [r3 - 8 * 16] ; [14]
1182 pmulhrsw m5, m3
1183
1184 palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1185
1186 pmaddubsw m6, [r4] ; [ 8]
1187 pmulhrsw m6, m3
1188 packuswb m5, m6
1189
1190 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1191
; rows 4 and 5 -> m6 (both use the same pixel pairs with different weights)
1192 pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2]
1193 pmulhrsw m6, m3
1194
1195 pmaddubsw m1, [r3 + 6 * 16] ; [28]
1196 pmulhrsw m1, m3
1197 packuswb m6, m1
1198
1199 palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1200
; rows 6 and 7 -> m1
1201 pmaddubsw m1, [r3] ; [22]
1202 pmulhrsw m1, m3
1203
1204 palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
1205
1206 pmaddubsw m2, [r3 - 6 * 16] ; [16]
1207 pmulhrsw m2, m3
1208 packuswb m1, m2
1209 jmp .transpose8x8
1210
1211 ALIGN 16
1212 .transpose8x8:
1213 jz .store ; ZF from the entry point's "cmp r4m": set -> store rows as computed
1214
1215 ; transpose 8x8
1216 punpckhbw m0, m4, m5
1217 punpcklbw m4, m5
1218 punpckhbw m2, m4, m0
1219 punpcklbw m4, m0
1220
1221 punpckhbw m0, m6, m1
1222 punpcklbw m6, m1
1223 punpckhbw m1, m6, m0
1224 punpcklbw m6, m0
1225
1226 punpckhdq m5, m4, m6
1227 punpckldq m4, m6
1228 punpckldq m6, m2, m1
1229 punpckhdq m2, m1
1230 mova m1, m2 ; results are again in m4, m5, m6, m1 as .store expects
1231
1232 .store:
1233 lea r4, [r1 * 3]
1234 movh [r0], m4 ; row 0
1235 movhps [r0 + r1], m4 ; row 1
1236 movh [r0 + r1 * 2], m5 ; row 2
1237 movhps [r0 + r4], m5 ; row 3
1238 add r0, r4 ; r0 = row 3
1239 movh [r0 + r1], m6 ; row 4
1240 movhps [r0 + r1 * 2], m6 ; row 5
1241 movh [r0 + r4], m1 ; row 6
1242 movhps [r0 + r1 * 4], m1 ; row 7
1243 RET
1244
; intra_pred_ang8_4: 8x8 angular prediction. When the 5th argument (r4m) equals
; 32 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; Eight lines are computed as pmaddubsw against ang_table rows (row index in
; brackets) and pmulhrsw with pw_1024, packed pairwise into m4, m5, m6, m1.
; The function then tail-jumps into intra_pred_ang8_3's .transpose8x8, whose
; leading jz consumes the ZF produced by the cmp at the top of this function.
1245 cglobal intra_pred_ang8_4, 3,5,8
1246 cmp r4m, byte 32
1247 cmove r2, r3mp
1248 lea r3, [ang_table + 24 * 16]
1249 lea r4, [ang_table + 10 * 16]
1250 mova m3, [pw_1024]
1251
1252 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1253 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1254
1255 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1256 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1257 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1258 mova m5, m1 ; same source pairs are used for two lines ([10] and [31])
1259
1260 pmaddubsw m4, m0, [r3 - 3 * 16] ; [21]
1261 pmulhrsw m4, m3
1262 pmaddubsw m1, [r4] ; [10]
1263 pmulhrsw m1, m3
1264 packuswb m4, m1
1265
1266 pmaddubsw m5, [r3 + 7 * 16] ; [31]
1267 pmulhrsw m5, m3
1268
1269 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1270
1271 pmaddubsw m6, [r3 - 4 * 16] ; [ 20]
1272 pmulhrsw m6, m3
1273 packuswb m5, m6
1274
1275 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1276
1277 pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9]
1278 pmulhrsw m6, m3
1279
1280 pmaddubsw m1, [r3 + 6 * 16] ; [30]
1281 pmulhrsw m1, m3
1282 packuswb m6, m1
1283
1284 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1285
1286 pmaddubsw m1, [r3 - 5 * 16] ; [19]
1287 pmulhrsw m1, m3
1288
1289 palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1290
1291 pmaddubsw m2, [r4 - 2 * 16] ; [8]
1292 pmulhrsw m2, m3
1293 packuswb m1, m2
1294 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1295
; intra_pred_ang8_5: 8x8 angular prediction. When the 5th argument (r4m) equals
; 31 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; Eight lines are computed as pmaddubsw against ang_table rows (row index in
; brackets) and pmulhrsw with pw_1024, packed pairwise into m4, m5, m6, m1.
; The function then tail-jumps into intra_pred_ang8_3's .transpose8x8, whose
; leading jz consumes the ZF produced by the cmp at the top of this function.
1296 cglobal intra_pred_ang8_5, 3,5,8
1297 cmp r4m, byte 31
1298 cmove r2, r3mp
1299 lea r3, [ang_table + 17 * 16]
1300 lea r4, [ang_table + 2 * 16]
1301 mova m3, [pw_1024]
1302
1303 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1304 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1305
1306 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1307 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1308 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1309 mova m5, m1 ; same source pairs are used for two lines ([2] and [19])
1310
1311 pmaddubsw m4, m0, [r3] ; [17]
1312 pmulhrsw m4, m3
1313 pmaddubsw m1, [r4] ; [2]
1314 pmulhrsw m1, m3
1315 packuswb m4, m1
1316
1317 pmaddubsw m5, [r3 + 2 * 16] ; [19]
1318 pmulhrsw m5, m3
1319
1320 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1321 mova m1, m6
1322
1323 pmaddubsw m1, [r4 + 2 * 16] ; [4]
1324 pmulhrsw m1, m3
1325 packuswb m5, m1
1326
1327 pmaddubsw m6, [r3 + 4 * 16] ; [21]
1328 pmulhrsw m6, m3
1329
1330 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1331
1332 mova m7, m1
1333 pmaddubsw m7, [r4 + 4 * 16] ; [6]
1334 pmulhrsw m7, m3
1335 packuswb m6, m7
1336
1337 pmaddubsw m1, [r3 + 6 * 16] ; [23]
1338 pmulhrsw m1, m3
1339
1340 palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1341
1342 pmaddubsw m2, [r4 + 6 * 16] ; [8]
1343 pmulhrsw m2, m3
1344 packuswb m1, m2
1345 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1346
; intra_pred_ang8_6: 8x8 angular prediction. When the 5th argument (r4m) equals
; 30 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; Eight lines are computed as pmaddubsw against ang_table rows (row index in
; brackets) and pmulhrsw with pw_1024 (held in m7 here), packed pairwise into
; m4, m5, m6, m1. The function then tail-jumps into intra_pred_ang8_3's
; .transpose8x8, whose leading jz consumes the ZF produced by the cmp below.
1347 cglobal intra_pred_ang8_6, 3,5,8
1348 cmp r4m, byte 30
1349 cmove r2, r3mp
1350 lea r3, [ang_table + 20 * 16]
1351 lea r4, [ang_table + 8 * 16]
1352 mova m7, [pw_1024]
1353
1354 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1355 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1356
1357 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1358 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1359 mova m1, m0 ; first two lines share the same source pairs
1360
1361 pmaddubsw m4, m0, [r3 - 7 * 16] ; [13]
1362 pmulhrsw m4, m7
1363 pmaddubsw m1, [r3 + 6 * 16] ; [26]
1364 pmulhrsw m1, m7
1365 packuswb m4, m1
1366
1367 palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1368
1369 pmaddubsw m5, m6, [r4 - 1 * 16] ; [7]
1370 pmulhrsw m5, m7
1371
1372 pmaddubsw m6, [r3] ; [20]
1373 pmulhrsw m6, m7
1374 packuswb m5, m6
1375
1376 palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1377
1378 pmaddubsw m6, m1, [r4 - 7 * 16] ; [1]
1379 pmulhrsw m6, m7
1380
1381 mova m3, m1
1382 pmaddubsw m3, [r3 - 6 * 16] ; [14]
1383 pmulhrsw m3, m7
1384 packuswb m6, m3
1385
1386 pmaddubsw m1, [r3 + 7 * 16] ; [27]
1387 pmulhrsw m1, m7
1388
1389 palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1390
1391 pmaddubsw m2, [r4] ; [8]
1392 pmulhrsw m2, m7
1393 packuswb m1, m2
1394 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1395
; intra_pred_ang8_7: 8x8 angular prediction. When the 5th argument (r4m) equals
; 29 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; Eight lines are computed as pmaddubsw against ang_table rows (row index in
; brackets) and pmulhrsw with pw_1024 (held in m7 here), packed pairwise into
; m4, m5, m6, m1. The function then tail-jumps into intra_pred_ang8_3's
; .transpose8x8, whose leading jz consumes the ZF produced by the cmp below.
1396 cglobal intra_pred_ang8_7, 3,5,8
1397 cmp r4m, byte 29
1398 cmove r2, r3mp
1399 lea r3, [ang_table + 24 * 16]
1400 lea r4, [ang_table + 6 * 16]
1401 mova m7, [pw_1024]
1402
1403 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1404 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1405
1406 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1407 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1408
1409 pmaddubsw m4, m0, [r4 + 3 * 16] ; [9]
1410 pmulhrsw m4, m7
1411 pmaddubsw m3, m0, [r3 - 6 * 16] ; [18]
1412 pmulhrsw m3, m7
1413 packuswb m4, m3
1414
1415 pmaddubsw m5, m0, [r3 + 3 * 16] ; [27]
1416 pmulhrsw m5, m7
1417
1418 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1419
1420 pmaddubsw m6, m1, [r4 - 2 * 16] ; [4]
1421 pmulhrsw m6, m7
1422 packuswb m5, m6
1423
1424 pmaddubsw m6, m1, [r4 + 7 * 16] ; [13]
1425 pmulhrsw m6, m7
1426
1427 mova m3, m1
1428 pmaddubsw m3, [r3 - 2 * 16] ; [22]
1429 pmulhrsw m3, m7
1430 packuswb m6, m3
1431
1432 pmaddubsw m1, [r3 + 7 * 16] ; [31]
1433 pmulhrsw m1, m7
1434
1435 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1436
1437 pmaddubsw m2, [r4 + 2 * 16] ; [8]
1438 pmulhrsw m2, m7
1439 packuswb m1, m2
1440 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1441
; intra_pred_ang8_8: 8x8 angular prediction. When the 5th argument (r4m) equals
; 28 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; The first six lines all use the same source pairs (m0) with different
; ang_table rows; the last two use the pairs shifted by one sample (m2).
; Results are packed pairwise into m4, m5, m6, m1. The function then tail-jumps
; into intra_pred_ang8_3's .transpose8x8, whose leading jz consumes the ZF
; produced by the cmp below.
1442 cglobal intra_pred_ang8_8, 3,5,8
1443 cmp r4m, byte 28
1444 cmove r2, r3mp
1445 lea r3, [ang_table + 23 * 16]
1446 lea r4, [ang_table + 8 * 16]
1447 mova m7, [pw_1024]
1448
1449 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1450 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1451
1452 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1453 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1454 palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1455
1456 pmaddubsw m4, m0, [r4 - 3 * 16] ; [5]
1457 pmulhrsw m4, m7
1458 pmaddubsw m3, m0, [r4 + 2 * 16] ; [10]
1459 pmulhrsw m3, m7
1460 packuswb m4, m3
1461
1462 pmaddubsw m5, m0, [r3 - 8 * 16] ; [15]
1463 pmulhrsw m5, m7
1464
1465 pmaddubsw m6, m0, [r3 - 3 * 16] ; [20]
1466 pmulhrsw m6, m7
1467 packuswb m5, m6
1468
1469 pmaddubsw m6, m0, [r3 + 2 * 16] ; [25]
1470 pmulhrsw m6, m7
1471
1472 pmaddubsw m0, [r3 + 7 * 16] ; [30]
1473 pmulhrsw m0, m7
1474 packuswb m6, m0
1475
1476 pmaddubsw m1, m2, [r4 - 5 * 16] ; [3]
1477 pmulhrsw m1, m7
1478
1479 pmaddubsw m2, [r4] ; [8]
1480 pmulhrsw m2, m7
1481 packuswb m1, m2
1482 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1483
; intra_pred_ang8_9: 8x8 angular prediction. When the 5th argument (r4m) equals
; 27 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; All eight lines use the same interleaved source pairs (m0) and differ only in
; the ang_table row (2, 4, ..., 16). Results are packed pairwise into
; m4, m5, m6, m1. The function then tail-jumps into intra_pred_ang8_3's
; .transpose8x8, whose leading jz consumes the ZF produced by the cmp below.
1484 cglobal intra_pred_ang8_9, 3,5,8
1485 cmp r4m, byte 27
1486 cmove r2, r3mp
1487 lea r3, [ang_table + 10 * 16]
1488 mova m7, [pw_1024]
1489
1490 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1491 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1492
1493 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1494
1495 pmaddubsw m4, m0, [r3 - 8 * 16] ; [2]
1496 pmulhrsw m4, m7
1497 pmaddubsw m3, m0, [r3 - 6 * 16] ; [4]
1498 pmulhrsw m3, m7
1499 packuswb m4, m3
1500
1501 pmaddubsw m5, m0, [r3 - 4 * 16] ; [6]
1502 pmulhrsw m5, m7
1503
1504 pmaddubsw m6, m0, [r3 - 2 * 16] ; [8]
1505 pmulhrsw m6, m7
1506 packuswb m5, m6
1507
1508 pmaddubsw m6, m0, [r3] ; [10]
1509 pmulhrsw m6, m7
1510
1511 pmaddubsw m2, m0, [r3 + 2 * 16] ; [12]
1512 pmulhrsw m2, m7
1513 packuswb m6, m2
1514
1515 pmaddubsw m1, m0, [r3 + 4 * 16] ; [14]
1516 pmulhrsw m1, m7
1517
1518 pmaddubsw m0, [r3 + 6 * 16] ; [16]
1519 pmulhrsw m0, m7
1520 packuswb m1, m0
1521 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1522
; intra_pred_ang8_10: 8x8 block built from the 8 bytes at [r2 + 1], two rows per
; register after a pshufb with pb_unpackbq (defined outside this section;
; presumably it replicates byte 0 across the low qword and byte 1 across the
; high qword, giving one constant-valued row per source byte - TODO confirm).
; Rows 1..7 are stored first. Row 0 is stored last at .quit, and when the 6th
; argument (r5m) is non-zero it is first adjusted per pixel by
; ([r3 + 1 + i] - [r3]) >> 1 with unsigned saturation.
1523 cglobal intra_pred_ang8_10, 4,5,5
1524 movh m0, [r2 + 1]
1525 mova m4, [pb_unpackbq]
1526 palignr m1, m0, 2
1527 pshufb m1, m4
1528 palignr m2, m0, 4
1529 pshufb m2, m4
1530 palignr m3, m0, 6
1531 pshufb m3, m4
1532 pshufb m0, m4
1533
1534 lea r4, [r1 * 3] ; r4 = 3 * stride
1535 movhps [r0 + r1], m0 ; row 1 (row 0 is written at .quit)
1536 movh [r0 + r1 * 2], m1
1537 movhps [r0 + r4], m1
1538 lea r2, [r0 + r1 * 4] ; r2 = address of row 4
1539 movh [r2], m2
1540 movhps [r2 + r1], m2
1541 movh [r2 + r1 * 2], m3
1542 movhps [r2 + r4], m3
1543
1544 ; filter
1545 cmp r5m, byte 0
1546 jz .quit
1547
1548 pmovzxbw m0, m0 ; row 0 widened to words
1549 movu m1, [r3]
1550 palignr m2, m1, 1 ; bytes starting at [r3 + 1]
1551 pshufb m1, m4 ; low qword: byte [r3] replicated (see note on pb_unpackbq)
1552 pmovzxbw m1, m1
1553 pmovzxbw m2, m2
1554 psubw m2, m1
1555 psraw m2, 1 ; arithmetic shift keeps the sign of the difference
1556 paddw m0, m2
1557 packuswb m0, m0 ; saturate back to unsigned bytes
1558
1559 .quit:
1560 movh [r0], m0 ; row 0, filtered or not
1561 RET
1562
; intra_pred_ang8_26: 8x8 block in which every row is the 8 bytes at [r3 + 1].
; When the 6th argument (r5m) is non-zero, the first byte of each row k is then
; overwritten with [r3 + 1] + (([r2 + 1 + k] - [r2]) >> 1), saturated to an
; unsigned byte. This relies on pb_unpackbq (defined outside this section)
; replicating byte 0 across the low qword - TODO confirm.
1563 cglobal intra_pred_ang8_26, 4,5,3
1564 movh m0, [r3 + 1]
1565
1566 lea r4, [r1 * 3] ; r4 = 3 * stride
1567 movh [r0], m0
1568 movh [r0 + r1], m0
1569 movh [r0 + r1 * 2], m0
1570 movh [r0 + r4], m0
1571 lea r3, [r0 + r1 * 4] ; r3 = address of row 4 (source pointer no longer needed)
1572 movh [r3], m0
1573 movh [r3 + r1], m0
1574 movh [r3 + r1 * 2], m0
1575 movh [r3 + r4], m0
1576
1577 ; filter
1578 cmp r5m, byte 0
1579 jz .quit
1580
1581 pshufb m0, [pb_unpackbq]
1582 pmovzxbw m0, m0
1583 movu m1, [r2]
1584 palignr m2, m1, 1 ; bytes starting at [r2 + 1]
1585 pshufb m1, [pb_unpackbq]
1586 pmovzxbw m1, m1
1587 pmovzxbw m2, m2
1588 psubw m2, m1
1589 psraw m2, 1 ; arithmetic shift keeps the sign of the difference
1590 paddw m0, m2
1591 packuswb m0, m0 ; saturate back to unsigned bytes
1592 pextrb [r0], m0, 0 ; byte k goes to column 0 of row k
1593 pextrb [r0 + r1], m0, 1
1594 pextrb [r0 + r1 * 2], m0, 2
1595 pextrb [r0 + r4], m0, 3
1596 pextrb [r3], m0, 4
1597 pextrb [r3 + r1], m0, 5
1598 pextrb [r3 + r1 * 2], m0, 6
1599 pextrb [r3 + r4], m0, 7
1600
1601 .quit:
1602 RET
1603
; intra_pred_ang8_11: 8x8 angular prediction. When the 5th argument (r4m) equals
; 25 the reference pointer is taken from the 4th argument (r3mp) instead of r2.
; Unlike the lower-numbered kernels the source is read from [r2] (not [r2 + 1]).
; All eight lines use the same interleaved pairs (m0) and differ only in the
; ang_table row (30, 28, ..., 16). Results are packed pairwise into
; m4, m5, m6, m1. The function then tail-jumps into intra_pred_ang8_3's
; .transpose8x8, whose leading jz consumes the ZF produced by the cmp below.
1604 cglobal intra_pred_ang8_11, 3,5,8
1605 cmp r4m, byte 25
1606 cmove r2, r3mp
1607 lea r3, [ang_table + 23 * 16]
1608 mova m7, [pw_1024]
1609
1610 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1611 palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1612
1613 punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1614
1615 pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
1616 pmulhrsw m4, m7
1617 pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
1618 pmulhrsw m3, m7
1619 packuswb m4, m3
1620
1621 pmaddubsw m5, m0, [r3 + 3 * 16] ; [26]
1622 pmulhrsw m5, m7
1623
1624 pmaddubsw m6, m0, [r3 + 1 * 16] ; [24]
1625 pmulhrsw m6, m7
1626 packuswb m5, m6
1627
1628 pmaddubsw m6, m0, [r3 - 1 * 16] ; [22]
1629 pmulhrsw m6, m7
1630
1631 pmaddubsw m2, m0, [r3 - 3 * 16] ; [20]
1632 pmulhrsw m2, m7
1633 packuswb m6, m2
1634
1635 pmaddubsw m1, m0, [r3 - 5 * 16] ; [18]
1636 pmulhrsw m1, m7
1637
1638 pmaddubsw m0, [r3 - 7 * 16] ; [16]
1639 pmulhrsw m0, m7
1640 packuswb m1, m0
1641 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1642
; intra_pred_ang8_12: 8x8 angular prediction that needs one sample from the
; other reference array. r2 and r3 are swapped when the 5th argument (r4m)
; equals 24. The main reference at [r2] is extended downward by one byte taken
; from [r3 + 6] (shown as 'a' in the layout comments).
; Lines are computed as pmaddubsw against ang_table rows (index in brackets)
; and pmulhrsw with pw_1024, packed pairwise into m4, m5, m6, m1 in output
; order. The function then tail-jumps into intra_pred_ang8_3's .transpose8x8,
; whose leading jz consumes the ZF produced by the cmp below.
1643 cglobal intra_pred_ang8_12, 4,5,8
1644 cmp r4m, byte 24
1645 mov r4, r2 ; mov/cmov leave the flags alone, so both cmovz see the same ZF
1646 cmovz r2, r3
1647 cmovz r3, r4
1648
1649 lea r4, [ang_table + 22 * 16]
1650 mova m7, [pw_1024]
1651
1652 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1653 pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
1654 pinsrb m0, [r3 + 6], 0 ; fill the vacated byte 'a' from the other array
1655 punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
1656 punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1657 palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1658
1659 pmaddubsw m4, m2, [r4 + 5 * 16] ; [27]
1660 pmulhrsw m4, m7
1661 pmaddubsw m3, m2, [r4] ; [22]
1662 pmulhrsw m3, m7
1663 packuswb m4, m3
1664
1665 pmaddubsw m1, m0, [r4 + 7 * 16] ; [29]
1666 pmulhrsw m1, m7
1667
1668 pmaddubsw m0, [r4 + 2 * 16] ; [24]
1669 pmulhrsw m0, m7
1670 packuswb m1, m0
1671
1672 pmaddubsw m5, m2, [r4 - 5 * 16] ; [17]
1673 pmulhrsw m5, m7
1674
1675 lea r4, [ang_table + 7 * 16] ; re-base so the remaining rows are reachable
1676 pmaddubsw m6, m2, [r4 + 5 * 16] ; [12]
1677 pmulhrsw m6, m7
1678 packuswb m5, m6
1679
1680 pmaddubsw m6, m2, [r4] ; [7]
1681 pmulhrsw m6, m7
1682
1683 pmaddubsw m2, [r4 - 5 * 16] ; [2]
1684 pmulhrsw m2, m7
1685 packuswb m6, m2
1686 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1687
; intra_pred_ang8_13: 8x8 angular prediction that needs two samples from the
; other reference array. r2 and r3 are swapped when the 5th argument (r4m)
; equals 23. The main reference at [r2] is extended downward with the bytes at
; [r3 + 4] ('a') and [r3 + 7] ('b').
; Lines are computed as pmaddubsw against ang_table rows (index in brackets)
; and pmulhrsw with pw_1024. The packuswb order below places them into
; m4, m5, m6, m1 in output order. The function then tail-jumps into
; intra_pred_ang8_3's .transpose8x8, whose leading jz consumes the ZF produced
; by the cmp below.
1688 cglobal intra_pred_ang8_13, 4,5,8
1689 cmp r4m, byte 23
1690 mov r4, r2 ; mov/cmov leave the flags alone, so both cmovz see the same ZF
1691 cmovz r2, r3
1692 cmovz r3, r4
1693
1694 lea r4, [ang_table + 24 * 16]
1695 mova m7, [pw_1024]
1696
1697 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1698 pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
1699 pinsrb m1, [r3 + 4], 0
1700 pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
1701 pinsrb m0, [r3 + 7], 0
1702 punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1703 punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1704 palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1705 palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1706
1707 pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
1708 pmulhrsw m4, m7
1709
1710 pmaddubsw m6, m1, [r4 + 4 * 16] ; [28]
1711 pmulhrsw m6, m7
1712
1713 pmaddubsw m0, [r4] ; [24]
1714 pmulhrsw m0, m7
1715
1716 lea r4, [ang_table + 13 * 16] ; re-base so the remaining rows are reachable
1717 pmaddubsw m3, m5, [r4 + 1 * 16] ; [14]
1718 pmulhrsw m3, m7
1719 packuswb m4, m3
1720
1721 pmaddubsw m5, [r4 - 8 * 16] ; [5]
1722 pmulhrsw m5, m7
1723 packuswb m5, m6
1724
1725 pmaddubsw m6, m1, [r4 + 6 * 16] ; [19]
1726 pmulhrsw m6, m7
1727
1728 pmaddubsw m2, m1, [r4 - 3 * 16] ; [10]
1729 pmulhrsw m2, m7
1730 packuswb m6, m2
1731
1732 pmaddubsw m1, [r4 - 12 * 16] ; [1]
1733 pmulhrsw m1, m7
1734 packuswb m1, m0
1735 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1736
; intra_pred_ang8_14: 8x8 angular prediction that needs three samples from the
; other reference array. r2 and r3 are swapped when the 5th argument (r4m)
; equals 22. The main reference is loaded from [r2 - 2] and its two lowest
; bytes are replaced with [r3 + 2] ('a') and [r3 + 5] ('b'); a third byte from
; [r3 + 7] ('c') is inserted after one more shift.
; Lines are computed as pmaddubsw against ang_table rows (index in brackets)
; and pmulhrsw with pw_1024 (held in m3 here). The packuswb order below places
; them into m4, m5, m6, m1 in output order. The function then tail-jumps into
; intra_pred_ang8_3's .transpose8x8, whose leading jz consumes the ZF produced
; by the cmp below.
1737 cglobal intra_pred_ang8_14, 4,5,8
1738 cmp r4m, byte 22
1739 mov r4, r2 ; mov/cmov leave the flags alone, so both cmovz see the same ZF
1740 cmovz r2, r3
1741 cmovz r3, r4
1742
1743 lea r4, [ang_table + 24 * 16]
1744 mova m3, [pw_1024]
1745
1746 movu m1, [r2 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
1747 pinsrb m1, [r3 + 2], 1
1748 pinsrb m1, [r3 + 5], 0
1749 pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
1750 pinsrb m0, [r3 + 7], 0
1751 punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1752 punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1753 palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1754 palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1755 palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1756
1757 pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
1758 pmulhrsw m4, m3
1759
1760 pmaddubsw m0, [r4] ; [24]
1761 pmulhrsw m0, m3
1762
1763 pmaddubsw m5, m6, [r4 + 1 * 16] ; [25]
1764 pmulhrsw m5, m3
1765
1766 lea r4, [ang_table + 12 * 16] ; re-base for the remaining rows
1767 pmaddubsw m6, [r4] ; [12]
1768 pmulhrsw m6, m3
1769 packuswb m5, m6
1770
1771 pmaddubsw m6, m1, [r4 + 19 * 16] ; [31]
1772 pmulhrsw m6, m3
1773
1774 pmaddubsw m2, [r4 - 6 * 16] ; [6]
1775 pmulhrsw m2, m3
1776 packuswb m4, m2
1777
1778 pmaddubsw m2, m1, [r4 + 6 * 16] ; [18]
1779 pmulhrsw m2, m3
1780 packuswb m6, m2
1781
1782 pmaddubsw m1, [r4 - 7 * 16] ; [5]
1783 pmulhrsw m1, m3
1784 packuswb m1, m0
1785 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1786
; intra_pred_ang8_15: 8x8 angular prediction that needs several samples from
; the other reference array. r2 and r3 are swapped when the 5th argument (r4m)
; equals 21. Bytes from [r3] are gathered with the c_mode16_15 shuffle (table
; defined outside this section) and joined below the main reference by
; palignr; one further byte from [r3 + 8] ('d') is inserted afterwards.
; Lines are computed as pmaddubsw against ang_table rows (index in brackets)
; and pmulhrsw with pw_1024 (held in m3 here). The packuswb order below places
; them into m4, m5, m6, m1 in output order. The function then tail-jumps into
; intra_pred_ang8_3's .transpose8x8, whose leading jz consumes the ZF produced
; by the cmp below.
1787 cglobal intra_pred_ang8_15, 4,5,8
1788 cmp r4m, byte 21
1789 mov r4, r2 ; mov/cmov leave the flags alone, so both cmovz see the same ZF
1790 cmovz r2, r3
1791 cmovz r3, r4
1792
1793 lea r4, [ang_table + 23 * 16]
1794 mova m3, [pw_1024]
1795
1796 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1797 movu m2, [r3]
1798 pshufb m2, [c_mode16_15]
1799 palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
1800 pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
1801 pinsrb m0, [r3 + 8], 0
1802 punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1803 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1804 palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1805 palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1806 palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1807 palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1808
1809 pmaddubsw m4, [r4 - 8 * 16] ; [15]
1810 pmulhrsw m4, m3
1811
1812 pmaddubsw m2, m5, [r4 + 7 * 16] ; [30]
1813 pmulhrsw m2, m3
1814 packuswb m4, m2
1815
1816 pmaddubsw m5, [r4 - 10 * 16] ; [13]
1817 pmulhrsw m5, m3
1818
1819 pmaddubsw m2, m6, [r4 + 5 * 16] ; [28]
1820 pmulhrsw m2, m3
1821 packuswb m5, m2
1822
1823 pmaddubsw m2, m1, [r4 + 3 * 16] ; [26]
1824 pmulhrsw m2, m3
1825
1826 pmaddubsw m0, [r4 + 1 * 16] ; [24]
1827 pmulhrsw m0, m3
1828
1829 lea r4, [ang_table + 11 * 16] ; re-base for the remaining rows
1830 pmaddubsw m6, [r4] ; [11]
1831 pmulhrsw m6, m3
1832 packuswb m6, m2
1833
1834 pmaddubsw m1, [r4 - 2 * 16] ; [9]
1835 pmulhrsw m1, m3
1836 packuswb m1, m0
1837 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1838
; intra_pred_ang8_16: 8x8 angular prediction that needs several samples from
; the other reference array. r2 and r3 are swapped when the 5th argument (r4m)
; equals 20. Bytes from [r3] are gathered with the c_mode16_16 shuffle (table
; defined outside this section) and joined below the main reference by
; palignr; one further byte from [r3 + 8] ('e') is inserted afterwards.
; Lines are computed as pmaddubsw against ang_table rows (index in brackets)
; and pmulhrsw with pw_1024. The packuswb order below places them into
; m4, m5, m6, m1 in output order. The function then tail-jumps into
; intra_pred_ang8_3's .transpose8x8, whose leading jz consumes the ZF produced
; by the cmp below.
1839 cglobal intra_pred_ang8_16, 4,5,8
1840 cmp r4m, byte 20
1841 mov r4, r2 ; mov/cmov leave the flags alone, so both cmovz see the same ZF
1842 cmovz r2, r3
1843 cmovz r3, r4
1844
1845 lea r4, [ang_table + 22 * 16]
1846 mova m7, [pw_1024]
1847
1848 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1849 movu m2, [r3]
1850 pshufb m2, [c_mode16_16]
1851 palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
1852 pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
1853 pinsrb m0, [r3 + 8], 0
1854 punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1855 punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
1856 palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1857 palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1858 palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1859 palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1860 palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1861
1862 pmaddubsw m3, m5, [r4] ; [22]
1863 pmulhrsw m3, m7
1864
1865 pmaddubsw m0, [r4 + 2 * 16] ; [24]
1866 pmulhrsw m0, m7
1867
1868 lea r4, [ang_table + 9 * 16] ; re-base for the remaining rows
1869
1870 pmaddubsw m4, [r4 + 2 * 16] ; [11]
1871 pmulhrsw m4, m7
1872 packuswb m4, m3
1873
1874 pmaddubsw m2, [r4 + 3 * 16] ; [12]
1875 pmulhrsw m2, m7
1876
1877 pmaddubsw m5, [r4 - 8 * 16] ; [1]
1878 pmulhrsw m5, m7
1879 packuswb m5, m2
1880
1881 mova m2, m6 ; same source pairs are used for two lines ([23] and [2])
1882 pmaddubsw m6, [r4 + 14 * 16] ; [23]
1883 pmulhrsw m6, m7
1884
1885 pmaddubsw m2, [r4 - 7 * 16] ; [2]
1886 pmulhrsw m2, m7
1887 packuswb m6, m2
1888
1889 pmaddubsw m1, [r4 + 4 * 16] ; [13]
1890 pmulhrsw m1, m7
1891 packuswb m1, m0
1892 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1893
; intra_pred_ang8_17: 8x8 angular prediction that needs several samples from
; the other reference array. r2 and r3 are swapped when the 5th argument (r4m)
; equals 19. Bytes from [r3] are gathered with the c_mode16_17 shuffle (table
; defined outside this section) and joined below the main reference by
; palignr; one further byte from [r3 + 7] ('f') is inserted afterwards.
; Lines are computed as pmaddubsw against ang_table rows (index in brackets)
; and pmulhrsw with pw_1024 (held in m3 here). The packuswb order below places
; them into m4, m5, m6, m1 in output order. The function then tail-jumps into
; intra_pred_ang8_3's .transpose8x8, whose leading jz consumes the ZF produced
; by the cmp below.
1894 cglobal intra_pred_ang8_17, 4,5,8
1895 cmp r4m, byte 19
1896 mov r4, r2 ; mov/cmov leave the flags alone, so both cmovz see the same ZF
1897 cmovz r2, r3
1898 cmovz r3, r4
1899
1900 lea r4, [ang_table + 17 * 16]
1901 mova m3, [pw_1024]
1902
1903 movu m2, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1904 movu m1, [r3]
1905 pshufb m1, [c_mode16_17]
1906 palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
1907 pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
1908 pinsrb m0, [r3 + 7], 0
1909 punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1910 punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]
1911
1912 palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1913 palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1914 palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1915
1916
1917 pmaddubsw m2, [r4 - 5 * 16] ; [12]
1918 pmulhrsw m2, m3
1919
1920 pmaddubsw m4, [r4 - 11 * 16] ; [6]
1921 pmulhrsw m4, m3
1922 packuswb m4, m2
1923
1924 pmaddubsw m5, [r4 + 1 * 16] ; [18]
1925 pmulhrsw m5, m3
1926
1927 palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1928 pmaddubsw m2, [r4 + 7 * 16] ; [24]
1929 pmulhrsw m2, m3
1930 packuswb m5, m2
1931
1932 palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1933 mova m2, m6 ; same source pairs are used for two lines ([30] and [4])
1934 pmaddubsw m6, [r4 + 13 * 16] ; [30]
1935 pmulhrsw m6, m3
1936
1937 pmaddubsw m2, [r4 - 13 * 16] ; [4]
1938 pmulhrsw m2, m3
1939 packuswb m6, m2
1940
1941 palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e]
1942 pmaddubsw m1, [r4 - 7 * 16] ; [10]
1943 pmulhrsw m1, m3
1944
1945 pmaddubsw m0, [r4 - 1 * 16] ; [16]
1946 pmulhrsw m0, m3
1947 packuswb m1, m0
1948 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1949
; intra_pred_ang8_18: 8x8 block, 8 bytes stored per row (4 args, 4 GPRs, 1 XMM).
; Builds a 16-byte window in m0: the low qword is the 8 bytes at [r2] in
; reversed order (pb_swap8), the high qword is the 8 bytes at [r3 + 1].
; Rows are written bottom-up (row 7 first); the window is shifted down one
; byte before each higher row, so row 0 starts at byte 7 of the window.
1950 cglobal intra_pred_ang8_18, 4,4,1
1951 movu m0, [r2]
1952 pshufb m0, [pb_swap8]
1953 movhps m0, [r3 + 1]
1954 lea r2, [r0 + r1 * 4] ; r2 = address of row 4
1955 lea r3, [r1 * 3] ; r3 = 3 * stride
1956 movh [r2 + r3], m0 ; row 7
1957 psrldq m0, 1
1958 movh [r2 + r1 * 2], m0 ; row 6
1959 psrldq m0, 1
1960 movh [r2 + r1], m0 ; row 5
1961 psrldq m0, 1
1962 movh [r2], m0 ; row 4
1963 psrldq m0, 1
1964 movh [r0 + r3], m0 ; row 3
1965 psrldq m0, 1
1966 movh [r0 + r1 * 2], m0 ; row 2
1967 psrldq m0, 1
1968 movh [r0 + r1], m0 ; row 1
1969 psrldq m0, 1
1970 movh [r0], m0 ; row 0
1971 RET
1972
1973
1974 ;-----------------------------------------------------------------------------
1975 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
1976 ;-----------------------------------------------------------------------------
1977 INIT_XMM ssse3
; intra_pred_ang16_2: 16x16 block in which row k is the 16 bytes at
; [ref + 2 + k]. ref is r2, replaced by the 4th argument (r3mp) when the 5th
; argument (r4m) equals 34. The 32 source bytes are held in m1:m0, and each row
; is extracted with a three-operand palignr. r0 advances two rows at a time.
1978 cglobal intra_pred_ang16_2, 3,3,3
1979 cmp r4m, byte 34
1980 cmove r2, r3mp
1981 movu m0, [r2 + 2]
1982 movu m1, [r2 + 18]
1983 movu [r0], m0 ; row 0
1984 palignr m2, m1, m0, 1
1985 movu [r0 + r1], m2
1986 lea r0, [r0 + r1 * 2]
1987 palignr m2, m1, m0, 2
1988 movu [r0], m2
1989 palignr m2, m1, m0, 3
1990 movu [r0 + r1], m2
1991 lea r0, [r0 + r1 * 2]
1992 palignr m2, m1, m0, 4
1993 movu [r0], m2
1994 palignr m2, m1, m0, 5
1995 movu [r0 + r1], m2
1996 lea r0, [r0 + r1 * 2]
1997 palignr m2, m1, m0, 6
1998 movu [r0], m2
1999 palignr m2, m1, m0, 7
2000 movu [r0 + r1], m2
2001 lea r0, [r0 + r1 * 2]
2002 palignr m2, m1, m0, 8
2003 movu [r0], m2
2004 palignr m2, m1, m0, 9
2005 movu [r0 + r1], m2
2006 lea r0, [r0 + r1 * 2]
2007 palignr m2, m1, m0, 10
2008 movu [r0], m2
2009 palignr m2, m1, m0, 11
2010 movu [r0 + r1], m2
2011 lea r0, [r0 + r1 * 2]
2012 palignr m2, m1, m0, 12
2013 movu [r0], m2
2014 palignr m2, m1, m0, 13
2015 movu [r0 + r1], m2
2016 lea r0, [r0 + r1 * 2]
2017 palignr m2, m1, m0, 14
2018 movu [r0], m2
2019 palignr m2, m1, m0, 15
2020 movu [r0 + r1], m2 ; row 15
2021 RET
2022
; TRANSPOSE_STORE_8x8 col, transpose, a, b, c, d
;   %1 (col)       : 8-byte column slot; only used on the transpose path, where
;                    every store is offset by %1 * 8 bytes
;   %2 (transpose) : 1 = transpose the 8x8 tile before storing, else store as is
;   %3..%6         : four registers, each holding two 8-byte lines (low qword =
;                    first line, high qword = second line)
; Transpose path: expects r5 = 3 * stride and r6 = r0 + 4 * stride; r0 and r6
; are left unchanged. Clobbers m0 and all four argument registers.
; Plain path: expects r5 = 3 * stride, stores eight rows and advances r0 by
; eight rows (two "lea r0, [r0 + r1 * 4]").
; The second unpack group uses %6 throughout; it used to name m1 directly,
; which only worked because every caller passes m1 as the sixth argument.
2023 %macro TRANSPOSE_STORE_8x8 6
2024 %if %2 == 1
2025 ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
2026 punpckhbw m0, %3, %4
2027 punpcklbw %3, %4
2028 punpckhbw %4, %3, m0
2029 punpcklbw %3, m0
2030
2031 punpckhbw m0, %5, %6
2032 punpcklbw %5, %6
2033 punpckhbw %6, %5, m0
2034 punpcklbw %5, m0
2035
2036 punpckhdq m0, %3, %5
2037 punpckldq %3, %5
2038 punpckldq %5, %4, %6
2039 punpckhdq %4, %6
2040
2041 movh [r0 + %1 * 8], %3
2042 movhps [r0 + r1 + %1 * 8], %3
2043 movh [r0 + r1*2 + %1 * 8], m0
2044 movhps [r0 + r5 + %1 * 8], m0
2045 movh [r6 + %1 * 8], %5
2046 movhps [r6 + r1 + %1 * 8], %5
2047 movh [r6 + r1*2 + %1 * 8], %4
2048 movhps [r6 + r5 + %1 * 8], %4
2049 %else
2050 ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
2051 movh [r0 ], %3
2052 movhps [r0 + r1 ], %3
2053 movh [r0 + r1 * 2], %4
2054 movhps [r0 + r5 ], %4
2055 lea r0, [r0 + r1 * 4]
2056 movh [r0 ], %5
2057 movhps [r0 + r1 ], %5
2058 movh [r0 + r1 * 2], %6
2059 movhps [r0 + r5 ], %6
2060 lea r0, [r0 + r1 * 4]
2061 %endif
2062 %endmacro
2063
2064 INIT_XMM sse4
; intra_pred_ang16_3: 16x16 angular prediction from the reference at r2, written
; transposed. The loop runs twice; each pass computes sixteen 8-sample lines
; (ang_table row index in brackets, pmulhrsw with pw_1024 in m7) and stores
; them as two transposed 8x8 tiles side by side (column slots 0 and 1) covering
; eight output rows. The last line of each pass uses row [00] and is a plain
; copy of 8 bytes from [r2 + 14]. Afterwards dst moves down eight rows and the
; reference advances by 8 bytes.
2065 cglobal intra_pred_ang16_3, 3,7,8
2066
2067 lea r3, [ang_table + 16 * 16]
2068 mov r4d, 2 ; loop counter: two passes
2069 lea r5, [r1 * 3] ; r5 -> 3 * stride
2070 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2071 mova m7, [pw_1024]
2072
2073 .loop:
2074 movu m0, [r2 + 1]
2075 palignr m1, m0, 1
2076
2077 punpckhbw m2, m0, m1
2078 punpcklbw m0, m1
2079 palignr m1, m2, m0, 2
2080
2081 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
2082 pmulhrsw m4, m7
2083 pmaddubsw m1, [r3 + 4 * 16] ; [20]
2084 pmulhrsw m1, m7
2085 packuswb m4, m1
2086
2087 palignr m5, m2, m0, 4
2088
2089 pmaddubsw m5, [r3 - 2 * 16] ; [14]
2090 pmulhrsw m5, m7
2091
2092 palignr m6, m2, m0, 6
2093
2094 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
2095 pmulhrsw m6, m7
2096 packuswb m5, m6
2097
2098 palignr m1, m2, m0, 8
2099
2100 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
2101 pmulhrsw m6, m7
2102
2103 pmaddubsw m1, [r3 + 12 * 16] ; [28]
2104 pmulhrsw m1, m7
2105 packuswb m6, m1
2106
2107 palignr m1, m2, m0, 10
2108
2109 pmaddubsw m1, [r3 + 6 * 16] ; [22]
2110 pmulhrsw m1, m7
2111
2112 palignr m2, m0, 12
2113
2114 pmaddubsw m2, [r3] ; [16]
2115 pmulhrsw m2, m7
2116 packuswb m1, m2
2117
2118 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2119
2120 movu m0, [r2 + 8]
2121 palignr m1, m0, 1
2122
2123 punpckhbw m2, m0, m1
2124 punpcklbw m0, m1
2125 palignr m5, m2, m0, 2
2126
2127 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
2128 pmulhrsw m4, m7
2129 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
2130 pmulhrsw m1, m7
2131 packuswb m4, m1
2132
2133 pmaddubsw m5, [r3 + 14 * 16] ; [30]
2134 pmulhrsw m5, m7
2135
2136 palignr m6, m2, m0, 4
2137
2138 pmaddubsw m6, [r3 + 8 * 16] ; [24]
2139 pmulhrsw m6, m7
2140 packuswb m5, m6
2141
2142 palignr m1, m2, m0, 6
2143
2144 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
2145 pmulhrsw m6, m7
2146
2147 palignr m1, m2, m0, 8
2148
2149 pmaddubsw m1, [r3 - 4 * 16] ; [12]
2150 pmulhrsw m1, m7
2151 packuswb m6, m1
2152
2153 palignr m1, m2, m0, 10
2154
2155 pmaddubsw m1, [r3 - 10 * 16] ; [06]
2156 pmulhrsw m1, m7
2157 packuswb m1, m1
2158
2159 movhps m1, [r2 + 14] ; [00]
2160
2161 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2162
2163 lea r0, [r6 + r1 * 4] ; r6 was dst + 4 rows, so this is dst + 8 rows
2164 lea r6, [r6 + r1 * 8]
2165 add r2, 8
2166 dec r4
2167 jnz .loop
2168
2169 RET
2170
2171 INIT_XMM sse4
; intra_pred_ang16_33: 16x16 angular prediction using the same ang_table row
; sequence as intra_pred_ang16_3, but reading the reference from the 4th
; argument (r3mp) and storing without a transpose.
; The loop runs twice; each pass fills one 8-byte-wide column strip of sixteen
; rows. The first eight rows go through TRANSPOSE_STORE_8x8 in plain-store mode
; (which advances r0 by eight rows), the next eight are stored inline. The last
; row of each pass uses row [00] and is a plain copy of 8 bytes from [r2 + 14].
; r6 keeps the original dst so the second pass can start at dst + 8.
2172 cglobal intra_pred_ang16_33, 3,7,8
2173 mov r2, r3mp
2174 lea r3, [ang_table + 16 * 16]
2175 mov r4d, 2 ; loop counter: two passes
2176 lea r5, [r1 * 3] ; r5 = 3 * stride
2177 mov r6, r0 ; remember the original dst
2178 mova m7, [pw_1024]
2179
2180 .loop:
2181 movu m0, [r2 + 1]
2182 palignr m1, m0, 1
2183
2184 punpckhbw m2, m0, m1
2185 punpcklbw m0, m1
2186 palignr m1, m2, m0, 2
2187
2188 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
2189 pmulhrsw m4, m7
2190 pmaddubsw m1, [r3 + 4 * 16] ; [20]
2191 pmulhrsw m1, m7
2192 packuswb m4, m1
2193
2194 palignr m5, m2, m0, 4
2195
2196 pmaddubsw m5, [r3 - 2 * 16] ; [14]
2197 pmulhrsw m5, m7
2198
2199 palignr m6, m2, m0, 6
2200
2201 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
2202 pmulhrsw m6, m7
2203 packuswb m5, m6
2204
2205 palignr m1, m2, m0, 8
2206
2207 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
2208 pmulhrsw m6, m7
2209
2210 pmaddubsw m1, [r3 + 12 * 16] ; [28]
2211 pmulhrsw m1, m7
2212 packuswb m6, m1
2213
2214 palignr m1, m2, m0, 10
2215
2216 pmaddubsw m1, [r3 + 6 * 16] ; [22]
2217 pmulhrsw m1, m7
2218
2219 palignr m2, m0, 12
2220
2221 pmaddubsw m2, [r3] ; [16]
2222 pmulhrsw m2, m7
2223 packuswb m1, m2
2224
2225 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2226
2227 movu m0, [r2 + 8]
2228 palignr m1, m0, 1
2229
2230 punpckhbw m2, m0, m1
2231 punpcklbw m0, m1
2232 palignr m5, m2, m0, 2
2233
2234 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
2235 pmulhrsw m4, m7
2236 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
2237 pmulhrsw m1, m7
2238 packuswb m4, m1
2239
2240 pmaddubsw m5, [r3 + 14 * 16] ; [30]
2241 pmulhrsw m5, m7
2242
2243 palignr m6, m2, m0, 4
2244
2245 pmaddubsw m6, [r3 + 8 * 16] ; [24]
2246 pmulhrsw m6, m7
2247 packuswb m5, m6
2248
2249 palignr m1, m2, m0, 6
2250
2251 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
2252 pmulhrsw m6, m7
2253
2254 palignr m1, m2, m0, 8
2255
2256 pmaddubsw m1, [r3 - 4 * 16] ; [12]
2257 pmulhrsw m1, m7
2258 packuswb m6, m1
2259
2260 palignr m1, m2, m0, 10
2261
2262 pmaddubsw m1, [r3 - 10 * 16] ; [06]
2263 pmulhrsw m1, m7
2264 packuswb m1, m1
2265
2266 movh m2, [r2 + 14] ; [00]
2267
2268 movh [r0 ], m4
2269 movhps [r0 + r1 ], m4
2270 movh [r0 + r1 * 2], m5
2271 movhps [r0 + r5 ], m5
2272 lea r0, [r0 + r1 * 4]
2273 movh [r0 ], m6
2274 movhps [r0 + r1 ], m6
2275 movh [r0 + r1 * 2], m1
2276 movh [r0 + r5 ], m2
2277
2278 lea r0, [r6 + 8] ; next strip: original dst + 8 bytes
2279 add r2, 8
2280 dec r4
2281 jnz .loop
2282
2283 RET
2284
2285 INIT_XMM sse4
; intra_pred_ang16_4: 16x16 angular prediction from the reference at r2, written
; transposed. The loop runs twice; each pass computes sixteen 8-sample lines
; (ang_table row index in brackets, pmulhrsw with pw_1024 in m7) and stores
; them as two transposed 8x8 tiles side by side (column slots 0 and 1) covering
; eight output rows. Afterwards dst moves down eight rows and the reference
; advances by 8 bytes.
; m2 (source pairs for line [8]) is deliberately kept intact across the first
; TRANSPOSE_STORE_8x8 because line [29] reuses the same pairs; m3 is the
; scratch register that makes this possible.
2286 cglobal intra_pred_ang16_4, 3,7,8
2287
2288 lea r3, [ang_table + 16 * 16]
2289 mov r4d, 2 ; loop counter: two passes
2290 lea r5, [r1 * 3] ; r5 -> 3 * stride
2291 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2292 mova m7, [pw_1024]
2293
2294 .loop:
2295 movu m0, [r2 + 1]
2296 palignr m1, m0, 1
2297
2298 punpckhbw m2, m0, m1
2299 punpcklbw m0, m1
2300 palignr m1, m2, m0, 2
2301 mova m5, m1
2302
2303 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
2304 pmulhrsw m4, m7
2305 pmaddubsw m1, [r3 - 6 * 16] ; [10]
2306 pmulhrsw m1, m7
2307 packuswb m4, m1
2308
2309 pmaddubsw m5, [r3 + 15 * 16] ; [31]
2310 pmulhrsw m5, m7
2311
2312 palignr m6, m2, m0, 4
2313
2314 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
2315 pmulhrsw m6, m7
2316 packuswb m5, m6
2317
2318 palignr m1, m2, m0, 6
2319
2320 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
2321 pmulhrsw m6, m7
2322
2323 pmaddubsw m1, [r3 + 14 * 16] ; [30]
2324 pmulhrsw m1, m7
2325 packuswb m6, m1
2326
2327 palignr m1, m2, m0, 8
2328
2329 pmaddubsw m1, [r3 + 3 * 16] ; [19]
2330 pmulhrsw m1, m7
2331
2332 palignr m2, m0, 10
2333
2334 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
2335 pmulhrsw m3, m7
2336 packuswb m1, m3
2337
2338 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2339
2340 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
2341 pmulhrsw m4, m7
2342
2343 movu m0, [r2 + 6]
2344 palignr m1, m0, 1
2345
2346 punpckhbw m2, m0, m1
2347 punpcklbw m0, m1
2348 palignr m1, m2, m0, 2
2349
2350 pmaddubsw m1, [r3 + 2 * 16] ; [18]
2351 pmulhrsw m1, m7
2352 packuswb m4, m1
2353
2354 palignr m5, m2, m0, 4
2355 mova m6, m5
2356
2357 pmaddubsw m5, [r3 - 9 * 16] ; [07]
2358 pmulhrsw m5, m7
2359
2360 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2361 pmulhrsw m6, m7
2362 packuswb m5, m6
2363
2364 palignr m6, m2, m0, 6
2365
2366 pmaddubsw m6, [r3 + 16] ; [17]
2367 pmulhrsw m6, m7
2368
2369 palignr m1, m2, m0, 8
2370 palignr m2, m0, 10
2371
2372 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
2373 pmulhrsw m3, m7
2374 packuswb m6, m3
2375
2376 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2377 pmulhrsw m1, m7
2378
2379 pmaddubsw m2, [r3] ; [16]
2380 pmulhrsw m2, m7
2381 packuswb m1, m2
2382
2383 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2384
2385 lea r0, [r6 + r1 * 4] ; r6 was dst + 4 rows, so this is dst + 8 rows
2386 lea r6, [r6 + r1 * 8]
2387 add r2, 8
2388 dec r4
2389 jnz .loop
2390
2391 RET
2392
2393 INIT_XMM sse4
; intra_pred_ang16_32: 16x16 angular prediction using the same ang_table row
; sequence as intra_pred_ang16_4, but reading the reference from the 4th
; argument (r3mp) and storing without a transpose.
; The loop runs twice; each pass fills one 8-byte-wide column strip of sixteen
; rows through two plain-store TRANSPOSE_STORE_8x8 invocations (each advances
; r0 by eight rows; the first macro argument is unused in that mode).
; r6 keeps the original dst so the second pass can start at dst + 8.
; m2 (source pairs for line [8]) is kept intact across the first store because
; line [29] reuses the same pairs.
2394 cglobal intra_pred_ang16_32, 3,7,8
2395 mov r2, r3mp
2396 lea r3, [ang_table + 16 * 16]
2397 mov r4d, 2 ; loop counter: two passes
2398 lea r5, [r1 * 3] ; r5 -> 3 * stride
2399 mov r6, r0 ; remember the original dst
2400 mova m7, [pw_1024]
2401
2402 .loop:
2403 movu m0, [r2 + 1]
2404 palignr m1, m0, 1
2405
2406 punpckhbw m2, m0, m1
2407 punpcklbw m0, m1
2408 palignr m1, m2, m0, 2
2409 mova m5, m1
2410
2411
2412 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
2413 pmulhrsw m4, m7
2414 pmaddubsw m1, [r3 - 6 * 16] ; [10]
2415 pmulhrsw m1, m7
2416 packuswb m4, m1
2417
2418 pmaddubsw m5, [r3 + 15 * 16] ; [31]
2419 pmulhrsw m5, m7
2420
2421 palignr m6, m2, m0, 4
2422
2423 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
2424 pmulhrsw m6, m7
2425 packuswb m5, m6
2426
2427 palignr m1, m2, m0, 6
2428
2429 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
2430 pmulhrsw m6, m7
2431
2432 pmaddubsw m1, [r3 + 14 * 16] ; [30]
2433 pmulhrsw m1, m7
2434 packuswb m6, m1
2435
2436 palignr m1, m2, m0, 8
2437
2438 pmaddubsw m1, [r3 + 3 * 16] ; [19]
2439 pmulhrsw m1, m7
2440
2441 palignr m2, m0, 10
2442
2443 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
2444 pmulhrsw m3, m7
2445 packuswb m1, m3
2446
2447 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2448
2449 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
2450 pmulhrsw m4, m7
2451
2452 movu m0, [r2 + 6]
2453 palignr m1, m0, 1
2454
2455 punpckhbw m2, m0, m1
2456 punpcklbw m0, m1
2457 palignr m1, m2, m0, 2
2458
2459 pmaddubsw m1, [r3 + 2 * 16] ; [18]
2460 pmulhrsw m1, m7
2461 packuswb m4, m1
2462
2463 palignr m5, m2, m0, 4
2464 mova m6, m5
2465
2466 pmaddubsw m5, [r3 - 9 * 16] ; [07]
2467 pmulhrsw m5, m7
2468
2469 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2470 pmulhrsw m6, m7
2471 packuswb m5, m6
2472
2473 palignr m6, m2, m0, 6
2474
2475 pmaddubsw m6, [r3 + 16] ; [17]
2476 pmulhrsw m6, m7
2477
2478 palignr m1, m2, m0, 8
2479 palignr m2, m0, 10
2480
2481 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
2482 pmulhrsw m3, m7
2483 packuswb m6, m3
2484
2485 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2486 pmulhrsw m1, m7
2487
2488 pmaddubsw m2, [r3] ; [16]
2489 pmulhrsw m2, m7
2490 packuswb m1, m2
2491
2492 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
2493
2494 lea r0, [r6 + 8] ; next strip: original dst + 8 bytes
2495 add r2, 8
2496 dec r4
2497 jnz .loop
2498
2499 RET
2500
;-----------------------------------------------------------------------------
; intra_pred_ang16_5 (SSE4): angular intra prediction, mode 5, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = source samples (3rd argument); advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; Per pass, 16 vectors of 8 results each are built as
; pmaddubsw (sample pairs x table entry) -> pmulhrsw -> packuswb and stored
; by two TRANSPOSE_STORE_8x8 invocations.
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
2501 INIT_XMM sse4
2502 cglobal intra_pred_ang16_5, 3,7,8
2503
2504 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
2505 mov r4d, 2 ; two passes
2506 lea r5, [r1 * 3] ; r5 -> 3 * stride
2507 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2508 mova m7, [pw_1024]
2509
2510 .loop:
2511 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2512 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2513 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2514 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2515
2516 palignr m5, m2, m3, 2 ; pair window moved up by one sample
2517
2518 pmaddubsw m4, m3, [r3 + 16] ; [17]
2519 pmulhrsw m4, m7
2520 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
2521 pmulhrsw m1, m7
2522 packuswb m4, m1
2523
2524 palignr m6, m2, m3, 4
2525
2526 pmaddubsw m5, [r3 + 3 * 16] ; [19]
2527 pmulhrsw m5, m7
2528 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
2529 pmulhrsw m1, m7
2530 packuswb m5, m1
2531
2532 palignr m1, m2, m3, 6
2533
2534 pmaddubsw m6, [r3 + 5 * 16] ; [21]
2535 pmulhrsw m6, m7
2536 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
2537 pmulhrsw m0, m7
2538 packuswb m6, m0
2539
2540 palignr m0, m2, m3, 8
2541
2542 pmaddubsw m1, [r3 + 7 * 16] ; [23]
2543 pmulhrsw m1, m7
2544 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2545 pmulhrsw m0, m7
2546 packuswb m1, m0
2547
2548 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2549
2550 palignr m4, m2, m3, 8
2551 palignr m5, m2, m3, 10
2552
2553 pmaddubsw m4, [r3 + 9 * 16] ; [25]
2554 pmulhrsw m4, m7
2555 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
2556 pmulhrsw m1, m7
2557 packuswb m4, m1
2558
2559 palignr m6, m2, m3, 12
2560
2561 pmaddubsw m5, [r3 + 11 * 16] ; [27]
2562 pmulhrsw m5, m7
2563 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
2564 pmulhrsw m1, m7
2565 packuswb m5, m1
2566
2567 palignr m1, m2, m3, 14
2568
2569 pmaddubsw m6, [r3 + 13 * 16] ; [29]
2570 pmulhrsw m6, m7
2571 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2572 pmulhrsw m0, m7
2573 packuswb m6, m0
2574
2575 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2576 pmulhrsw m1, m7
2577 pmaddubsw m2, [r3] ; [16]
2578 pmulhrsw m2, m7
2579 packuswb m1, m2
2580
2581 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2582
2583 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
2584 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
2585 add r2, 8 ; advance the sample source by 8 bytes
2586 dec r4
2587 jnz .loop
2588
2589 RET
2590
;-----------------------------------------------------------------------------
; intra_pred_ang16_31 (SSE4): angular intra prediction, mode 31, 16x16 block
; (mode and size taken from the symbol name).
; The arithmetic matches intra_pred_ang16_5 line for line; the differences are
; that r2 is reloaded from the 4th argument, TRANSPOSE_STORE_8x8 gets 0 as its
; second argument, and the second pass writes at the initial r0 + 8.
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = copy of the initial r0
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
2591 INIT_XMM sse4
2592 cglobal intra_pred_ang16_31, 3,7,8
2593 mov r2, r3mp ; r2 <- 4th argument; it replaces the 3rd as the sample source
2594 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
2595 mov r4d, 2 ; two passes
2596 lea r5, [r1 * 3] ; r5 -> 3 * stride
2597 mov r6, r0 ; keep the initial output pointer for the second pass
2598 mova m7, [pw_1024]
2599
2600 .loop:
2601 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2602 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2603 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2604 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2605
2606 palignr m5, m2, m3, 2 ; pair window moved up by one sample
2607
2608 pmaddubsw m4, m3, [r3 + 16] ; [17]
2609 pmulhrsw m4, m7
2610 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
2611 pmulhrsw m1, m7
2612 packuswb m4, m1
2613
2614 palignr m6, m2, m3, 4
2615
2616 pmaddubsw m5, [r3 + 3 * 16] ; [19]
2617 pmulhrsw m5, m7
2618 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
2619 pmulhrsw m1, m7
2620 packuswb m5, m1
2621
2622 palignr m1, m2, m3, 6
2623
2624 pmaddubsw m6, [r3 + 5 * 16] ; [21]
2625 pmulhrsw m6, m7
2626 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
2627 pmulhrsw m0, m7
2628 packuswb m6, m0
2629
2630 palignr m0, m2, m3, 8
2631
2632 pmaddubsw m1, [r3 + 7 * 16] ; [23]
2633 pmulhrsw m1, m7
2634 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2635 pmulhrsw m0, m7
2636 packuswb m1, m0
2637
2638 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2639
2640 palignr m4, m2, m3, 8
2641 palignr m5, m2, m3, 10
2642
2643 pmaddubsw m4, [r3 + 9 * 16] ; [25]
2644 pmulhrsw m4, m7
2645 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
2646 pmulhrsw m1, m7
2647 packuswb m4, m1
2648
2649 palignr m6, m2, m3, 12
2650
2651 pmaddubsw m5, [r3 + 11 * 16] ; [27]
2652 pmulhrsw m5, m7
2653 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
2654 pmulhrsw m1, m7
2655 packuswb m5, m1
2656
2657 palignr m1, m2, m3, 14
2658
2659 pmaddubsw m6, [r3 + 13 * 16] ; [29]
2660 pmulhrsw m6, m7
2661 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2662 pmulhrsw m0, m7
2663 packuswb m6, m0
2664
2665 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2666 pmulhrsw m1, m7
2667 pmaddubsw m2, [r3] ; [16]
2668 pmulhrsw m2, m7
2669 packuswb m1, m2
2670
2671 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
2672
2673 lea r0, [r6 + 8] ; second pass writes 8 bytes to the right of the initial pointer
2674 add r2, 8 ; advance the sample source by 8 bytes
2675 dec r4
2676 jnz .loop
2677
2678 RET
2679
;-----------------------------------------------------------------------------
; intra_pred_ang16_6 (SSE4): angular intra prediction, mode 6, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = source samples (3rd argument); advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; Per pass, 16 vectors of 8 results each are built as
; pmaddubsw (sample pairs x table entry) -> pmulhrsw -> packuswb and stored
; by two TRANSPOSE_STORE_8x8 invocations.
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
2680 INIT_XMM sse4
2681 cglobal intra_pred_ang16_6, 3,7,8
2682
2683 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
2684 mov r4d, 2 ; two passes
2685 lea r5, [r1 * 3] ; r5 -> 3 * stride
2686 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2687 mova m7, [pw_1024]
2688
2689 .loop:
2690 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2691 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2692 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2693 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2694
2695 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
2696 pmulhrsw m4, m7
2697 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
2698 pmulhrsw m1, m7
2699 packuswb m4, m1
2700
2701 palignr m6, m2, m3, 2 ; pair window moved up by one sample
2702
2703 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
2704 pmulhrsw m5, m7
2705 pmaddubsw m6, [r3 + 4 * 16] ; [20]
2706 pmulhrsw m6, m7
2707 packuswb m5, m6
2708
2709 palignr m1, m2, m3, 4
2710
2711 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
2712 pmulhrsw m6, m7
2713 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2714 pmulhrsw m0, m7
2715 packuswb m6, m0
2716
2717 palignr m0, m2, m3, 6
2718
2719 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2720 pmulhrsw m1, m7
2721 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2722 pmulhrsw m0, m7
2723 packuswb m1, m0
2724
2725 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2726
2727 palignr m4, m2, m3, 6
2728 palignr m6, m2, m3, 8
2729
2730 pmaddubsw m4, [r3 + 5 * 16] ; [21]
2731 pmulhrsw m4, m7
2732 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
2733 pmulhrsw m1, m7
2734 packuswb m4, m1
2735
2736 pmaddubsw m5, m6, [r3 - 16] ; [15]
2737 pmulhrsw m5, m7
2738 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2739 pmulhrsw m6, m7
2740 packuswb m5, m6
2741
2742 palignr m0, m2, m3, 10
2743
2744 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
2745 pmulhrsw m6, m7
2746 pmaddubsw m0, [r3 + 6 * 16] ; [22]
2747 pmulhrsw m0, m7
2748 packuswb m6, m0
2749
2750 palignr m2, m3, 12
2751
2752 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
2753 pmulhrsw m1, m7
2754 pmaddubsw m2, [r3] ; [16]
2755 pmulhrsw m2, m7
2756 packuswb m1, m2
2757
2758 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2759
2760 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
2761 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
2762 add r2, 8 ; advance the sample source by 8 bytes
2763 dec r4
2764 jnz .loop
2765
2766 RET
2767
;-----------------------------------------------------------------------------
; intra_pred_ang16_30 (SSE4): angular intra prediction, mode 30, 16x16 block
; (mode and size taken from the symbol name).
; The arithmetic matches intra_pred_ang16_6 line for line; the differences are
; that r2 is reloaded from the 4th argument, TRANSPOSE_STORE_8x8 gets 0 as its
; second argument, and the second pass writes at the initial r0 + 8.
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = copy of the initial r0
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
2768 INIT_XMM sse4
2769 cglobal intra_pred_ang16_30, 3,7,8
2770 mov r2, r3mp ; r2 <- 4th argument; it replaces the 3rd as the sample source
2771 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
2772 mov r4d, 2 ; two passes
2773 lea r5, [r1 * 3] ; r5 -> 3 * stride
2774 mov r6, r0 ; keep the initial output pointer for the second pass
2775 mova m7, [pw_1024]
2776
2777 .loop:
2778 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2779 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2780 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2781 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2782
2783 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
2784 pmulhrsw m4, m7
2785 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
2786 pmulhrsw m1, m7
2787 packuswb m4, m1
2788
2789 palignr m6, m2, m3, 2 ; pair window moved up by one sample
2790
2791 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
2792 pmulhrsw m5, m7
2793 pmaddubsw m6, [r3 + 4 * 16] ; [20]
2794 pmulhrsw m6, m7
2795 packuswb m5, m6
2796
2797 palignr m1, m2, m3, 4
2798
2799 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
2800 pmulhrsw m6, m7
2801 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2802 pmulhrsw m0, m7
2803 packuswb m6, m0
2804
2805 palignr m0, m2, m3, 6
2806
2807 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2808 pmulhrsw m1, m7
2809 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2810 pmulhrsw m0, m7
2811 packuswb m1, m0
2812
2813 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2814
2815 palignr m4, m2, m3, 6
2816 palignr m6, m2, m3, 8
2817
2818 pmaddubsw m4, [r3 + 5 * 16] ; [21]
2819 pmulhrsw m4, m7
2820 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
2821 pmulhrsw m1, m7
2822 packuswb m4, m1
2823
2824 pmaddubsw m5, m6, [r3 - 16] ; [15]
2825 pmulhrsw m5, m7
2826 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2827 pmulhrsw m6, m7
2828 packuswb m5, m6
2829
2830 palignr m0, m2, m3, 10
2831
2832 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
2833 pmulhrsw m6, m7
2834 pmaddubsw m0, [r3 + 6 * 16] ; [22]
2835 pmulhrsw m0, m7
2836 packuswb m6, m0
2837
2838 palignr m2, m3, 12
2839
2840 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
2841 pmulhrsw m1, m7
2842 pmaddubsw m2, [r3] ; [16]
2843 pmulhrsw m2, m7
2844 packuswb m1, m2
2845
2846 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
2847
2848 lea r0, [r6 + 8] ; second pass writes 8 bytes to the right of the initial pointer
2849 add r2, 8 ; advance the sample source by 8 bytes
2850 dec r4
2851 jnz .loop
2852
2853 RET
2854
;-----------------------------------------------------------------------------
; intra_pred_ang16_7 (SSE4): angular intra prediction, mode 7, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = source samples (3rd argument); advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; Per pass, 16 vectors of 8 results each are built as
; pmaddubsw (sample pairs x table entry) -> pmulhrsw -> packuswb and stored
; by two TRANSPOSE_STORE_8x8 invocations.
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
2855 INIT_XMM sse4
2856 cglobal intra_pred_ang16_7, 3,7,8
2857
2858 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
2859 mov r4d, 2 ; two passes
2860 lea r5, [r1 * 3] ; r5 -> 3 * stride
2861 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
2862 mova m7, [pw_1024]
2863
2864 .loop:
2865 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2866 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2867 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2868 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2869
2870 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
2871 pmulhrsw m4, m7
2872 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
2873 pmulhrsw m0, m7
2874 packuswb m4, m0
2875
2876 palignr m1, m2, m3, 2 ; pair window moved up by one sample
2877
2878 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
2879 pmulhrsw m5, m7
2880 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
2881 pmulhrsw m6, m7
2882 packuswb m5, m6
2883
2884 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
2885 pmulhrsw m6, m7
2886 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
2887 pmulhrsw m0, m7
2888 packuswb m6, m0
2889
2890 palignr m0, m2, m3, 4
2891
2892 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2893 pmulhrsw m1, m7
2894 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2895 pmulhrsw m0, m7
2896 packuswb m1, m0
2897
2898 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2899
2900 palignr m1, m2, m3, 4
2901
2902 pmaddubsw m4, m1, [r3 + 16] ; [17]
2903 pmulhrsw m4, m7
2904 pmaddubsw m1, [r3 + 10 * 16] ; [26]
2905 pmulhrsw m1, m7
2906 packuswb m4, m1
2907
2908 palignr m0, m2, m3, 6
2909
2910 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
2911 pmulhrsw m5, m7
2912 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
2913 pmulhrsw m6, m7
2914 packuswb m5, m6
2915
2916 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
2917 pmulhrsw m6, m7
2918 pmaddubsw m0, [r3 + 14 * 16] ; [30]
2919 pmulhrsw m0, m7
2920 packuswb m6, m0
2921
2922 palignr m2, m3, 8
2923
2924 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
2925 pmulhrsw m1, m7
2926 pmaddubsw m2, [r3] ; [16]
2927 pmulhrsw m2, m7
2928 packuswb m1, m2
2929
2930 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
2931
2932 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
2933 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
2934 add r2, 8 ; advance the sample source by 8 bytes
2935 dec r4
2936 jnz .loop
2937
2938 RET
2939
;-----------------------------------------------------------------------------
; intra_pred_ang16_29 (SSE4): angular intra prediction, mode 29, 16x16 block
; (mode and size taken from the symbol name).
; The arithmetic matches intra_pred_ang16_7 line for line; the differences are
; that r2 is reloaded from the 4th argument, TRANSPOSE_STORE_8x8 gets 0 as its
; second argument, and the second pass writes at the initial r0 + 8.
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = copy of the initial r0
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
2940 INIT_XMM sse4
2941 cglobal intra_pred_ang16_29, 3,7,8
2942 mov r2, r3mp ; r2 <- 4th argument; it replaces the 3rd as the sample source
2943 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
2944 mov r4d, 2 ; two passes
2945 lea r5, [r1 * 3] ; r5 -> 3 * stride
2946 mov r6, r0 ; keep the initial output pointer for the second pass
2947 mova m7, [pw_1024]
2948
2949 .loop:
2950 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2951 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2952 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2953 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
2954
2955 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
2956 pmulhrsw m4, m7
2957 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
2958 pmulhrsw m0, m7
2959 packuswb m4, m0
2960
2961 palignr m1, m2, m3, 2 ; pair window moved up by one sample
2962
2963 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
2964 pmulhrsw m5, m7
2965 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
2966 pmulhrsw m6, m7
2967 packuswb m5, m6
2968
2969 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
2970 pmulhrsw m6, m7
2971 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
2972 pmulhrsw m0, m7
2973 packuswb m6, m0
2974
2975 palignr m0, m2, m3, 4
2976
2977 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2978 pmulhrsw m1, m7
2979 pmaddubsw m0, [r3 - 8 * 16] ; [8]
2980 pmulhrsw m0, m7
2981 packuswb m1, m0
2982
2983 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2984
2985 palignr m1, m2, m3, 4
2986
2987 pmaddubsw m4, m1, [r3 + 16] ; [17]
2988 pmulhrsw m4, m7
2989 pmaddubsw m1, [r3 + 10 * 16] ; [26]
2990 pmulhrsw m1, m7
2991 packuswb m4, m1
2992
2993 palignr m0, m2, m3, 6
2994
2995 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
2996 pmulhrsw m5, m7
2997 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
2998 pmulhrsw m6, m7
2999 packuswb m5, m6
3000
3001 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
3002 pmulhrsw m6, m7
3003 pmaddubsw m0, [r3 + 14 * 16] ; [30]
3004 pmulhrsw m0, m7
3005 packuswb m6, m0
3006
3007 palignr m2, m3, 8
3008
3009 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
3010 pmulhrsw m1, m7
3011 pmaddubsw m2, [r3] ; [16]
3012 pmulhrsw m2, m7
3013 packuswb m1, m2
3014
3015 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3016
3017 lea r0, [r6 + 8] ; second pass writes 8 bytes to the right of the initial pointer
3018 add r2, 8 ; advance the sample source by 8 bytes
3019 dec r4
3020 jnz .loop
3021
3022 RET
3023
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 (SSE4): angular intra prediction, mode 8, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = source samples (3rd argument); advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; Only three pair windows are needed per pass: m1 (unshifted), m2 (one sample
; up) and m3 (two samples up).
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
3024 INIT_XMM sse4
3025 cglobal intra_pred_ang16_8, 3,7,8
3026
3027 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
3028 mov r4d, 2 ; two passes
3029 lea r5, [r1 * 3] ; r5 -> 3 * stride
3030 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3031 mova m7, [pw_1024]
3032
3033 .loop:
3034 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3035 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3036 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3037 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3038
3039 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
3040 pmulhrsw m4, m7
3041 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
3042 pmulhrsw m2, m7
3043 packuswb m4, m2
3044
3045 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
3046 pmulhrsw m5, m7
3047 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
3048 pmulhrsw m6, m7
3049 packuswb m5, m6
3050
3051 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
3052 pmulhrsw m6, m7
3053 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
3054 pmulhrsw m2, m7
3055 packuswb m6, m2
3056
3057 palignr m2, m0, m1, 2 ; pair window one sample up
3058 palignr m3, m0, m1, 4 ; pair window two samples up
3059
3060 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
3061 pmulhrsw m1, m7
3062 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
3063 pmulhrsw m0, m7
3064 packuswb m1, m0
3065
3066 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3067
3068 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
3069 pmulhrsw m4, m7
3070 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
3071 pmulhrsw m5, m7
3072 packuswb m4, m5
3073
3074 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
3075 pmulhrsw m5, m7
3076 pmaddubsw m2, [r3 + 12 * 16] ; [28]
3077 pmulhrsw m2, m7
3078 packuswb m5, m2
3079
3080 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
3081 pmulhrsw m6, m7
3082 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
3083 pmulhrsw m1, m7
3084 packuswb m6, m1
3085
3086 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
3087 pmulhrsw m1, m7
3088 pmaddubsw m3, [r3] ; [16]
3089 pmulhrsw m3, m7
3090 packuswb m1, m3
3091
3092 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3093
3094 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
3095 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
3096 add r2, 8 ; advance the sample source by 8 bytes
3097 dec r4
3098 jnz .loop
3099
3100 RET
3101
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 (SSE4): angular intra prediction, mode 28, 16x16 block
; (mode and size taken from the symbol name).
; The arithmetic matches intra_pred_ang16_8 line for line; the differences are
; that r2 is reloaded from the 4th argument, TRANSPOSE_STORE_8x8 gets 0 as its
; second argument, and the second pass writes at the initial r0 + 8.
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = copy of the initial r0
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
3102 INIT_XMM sse4
3103 cglobal intra_pred_ang16_28, 3,7,8
3104 mov r2, r3mp ; r2 <- 4th argument; it replaces the 3rd as the sample source
3105 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
3106 mov r4d, 2 ; two passes
3107 lea r5, [r1 * 3] ; r5 -> 3 * stride
3108 mov r6, r0 ; keep the initial output pointer for the second pass
3109 mova m7, [pw_1024]
3110
3111 .loop:
3112 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3113 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3114 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3115 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3116
3117 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
3118 pmulhrsw m4, m7
3119 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
3120 pmulhrsw m2, m7
3121 packuswb m4, m2
3122
3123 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
3124 pmulhrsw m5, m7
3125 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
3126 pmulhrsw m6, m7
3127 packuswb m5, m6
3128
3129 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
3130 pmulhrsw m6, m7
3131 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
3132 pmulhrsw m2, m7
3133 packuswb m6, m2
3134
3135 palignr m2, m0, m1, 2 ; pair window one sample up
3136 palignr m3, m0, m1, 4 ; pair window two samples up
3137
3138 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
3139 pmulhrsw m1, m7
3140 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
3141 pmulhrsw m0, m7
3142 packuswb m1, m0
3143
3144 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3145
3146 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
3147 pmulhrsw m4, m7
3148 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
3149 pmulhrsw m5, m7
3150 packuswb m4, m5
3151
3152 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
3153 pmulhrsw m5, m7
3154 pmaddubsw m2, [r3 + 12 * 16] ; [28]
3155 pmulhrsw m2, m7
3156 packuswb m5, m2
3157
3158 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
3159 pmulhrsw m6, m7
3160 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
3161 pmulhrsw m1, m7
3162 packuswb m6, m1
3163
3164 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
3165 pmulhrsw m1, m7
3166 pmaddubsw m3, [r3] ; [16]
3167 pmulhrsw m3, m7
3168 packuswb m1, m3
3169
3170 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3171
3172 lea r0, [r6 + 8] ; second pass writes 8 bytes to the right of the initial pointer
3173 add r2, 8 ; advance the sample source by 8 bytes
3174 dec r4
3175 jnz .loop
3176
3177 RET
3178
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (SSE4): angular intra prediction, mode 9, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = source samples (3rd argument); advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; A single pair window (m2) feeds 15 weighted vectors per pass; the 16th is
; the shifted sample vector m3 itself, merged in without weighting.
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
3179 INIT_XMM sse4
3180 cglobal intra_pred_ang16_9, 3,7,8
3181
3182 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
3183 mov r4d, 2 ; two passes
3184 lea r5, [r1 * 3] ; r5 -> 3 * stride
3185 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3186 mova m7, [pw_1024]
3187
3188 .loop:
3189 movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3190 palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3191 punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3192
3193 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
3194 pmulhrsw m4, m7
3195 pmaddubsw m0, m2, [r3 - 12 * 16] ; [4]
3196 pmulhrsw m0, m7
3197 packuswb m4, m0
3198
3199 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
3200 pmulhrsw m5, m7
3201 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
3202 pmulhrsw m6, m7
3203 packuswb m5, m6
3204
3205 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
3206 pmulhrsw m6, m7
3207 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
3208 pmulhrsw m0, m7
3209 packuswb m6, m0
3210
3211 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
3212 pmulhrsw m1, m7
3213 pmaddubsw m0, m2, [r3] ; [16]
3214 pmulhrsw m0, m7
3215 packuswb m1, m0
3216
3217 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3218
3219 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
3220 pmulhrsw m4, m7
3221 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
3222 pmulhrsw m5, m7
3223 packuswb m4, m5
3224
3225 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
3226 pmulhrsw m5, m7
3227 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
3228 pmulhrsw m6, m7
3229 packuswb m5, m6
3230
3231 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
3232 pmulhrsw m6, m7
3233 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
3234 pmulhrsw m1, m7
3235 packuswb m6, m1
3236
3237 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
3238 pmulhrsw m1, m7
3239 packuswb m1, m1 ; low 8 bytes = weighted result [30]
3240
3241 punpcklqdq m1, m3 ; [00]
3242
3243 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3244
3245 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
3246 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
3247 add r2, 8 ; advance the sample source by 8 bytes
3248 dec r4
3249 jnz .loop
3250
3251 RET
3252
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 (SSE4): angular intra prediction, mode 27, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer; r1 = stride; r5 = 3 * stride
;   r2 = source samples, reloaded from the 4th argument; advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r6 = copy of the initial r0
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; Each pass produces an 8-byte-wide, 16-row strip. The first eight rows go
; through TRANSPOSE_STORE_8x8 0, 0; the last eight are written directly with
; movh/movhps, the final row being the unweighted shifted samples in m2.
; NOTE(review): the direct stores assume TRANSPOSE_STORE_8x8 0, 0 leaves r0
; pointing at row 8 of the strip - the macro, ang_table and pw_1024 are
; defined outside this chunk; confirm. Presumably pw_1024 holds words of 1024,
; which would make pmulhrsw compute (x + 16) >> 5.
;-----------------------------------------------------------------------------
3253 INIT_XMM sse4
3254 cglobal intra_pred_ang16_27, 3,7,8
3255 mov r2, r3mp ; r2 <- 4th argument; it replaces the 3rd as the sample source
3256 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
3257 mov r4d, 2 ; two passes
3258 lea r5, [r1 * 3] ; r5 -> 3 * stride
3259 mov r6, r0 ; keep the initial output pointer for the second pass
3260 mova m7, [pw_1024]
3261
3262 .loop:
3263 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3264 palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3265 punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3266
3267 pmaddubsw m4, m3, [r3 - 14 * 16] ; [2]
3268 pmulhrsw m4, m7
3269 pmaddubsw m0, m3, [r3 - 12 * 16] ; [4]
3270 pmulhrsw m0, m7
3271 packuswb m4, m0
3272
3273 pmaddubsw m5, m3, [r3 - 10 * 16] ; [6]
3274 pmulhrsw m5, m7
3275 pmaddubsw m6, m3, [r3 - 8 * 16] ; [8]
3276 pmulhrsw m6, m7
3277 packuswb m5, m6
3278
3279 pmaddubsw m6, m3, [r3 - 6 * 16] ; [10]
3280 pmulhrsw m6, m7
3281 pmaddubsw m0, m3, [r3 - 4 * 16] ; [12]
3282 pmulhrsw m0, m7
3283 packuswb m6, m0
3284
3285 pmaddubsw m1, m3, [r3 - 2 * 16] ; [14]
3286 pmulhrsw m1, m7
3287 pmaddubsw m0, m3, [r3] ; [16]
3288 pmulhrsw m0, m7
3289 packuswb m1, m0
3290
3291 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3292
3293 pmaddubsw m4, m3, [r3 + 2 * 16] ; [18]
3294 pmulhrsw m4, m7
3295 pmaddubsw m5, m3, [r3 + 4 * 16] ; [20]
3296 pmulhrsw m5, m7
3297 packuswb m4, m5
3298
3299 pmaddubsw m5, m3, [r3 + 6 * 16] ; [22]
3300 pmulhrsw m5, m7
3301 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3302 pmulhrsw m6, m7
3303 packuswb m5, m6
3304
3305 pmaddubsw m6, m3, [r3 + 10 * 16] ; [26]
3306 pmulhrsw m6, m7
3307 pmaddubsw m1, m3, [r3 + 12 * 16] ; [28]
3308 pmulhrsw m1, m7
3309 packuswb m6, m1
3310
3311 pmaddubsw m1, m3, [r3 + 14 * 16] ; [30]
3312 pmulhrsw m1, m7
3313 packuswb m1, m1
3314
3315 movh [r0 ], m4 ; [18]
3316 movhps [r0 + r1 ], m4 ; [20]
3317 movh [r0 + r1 * 2], m5 ; [22]
3318 movhps [r0 + r5 ], m5 ; [24]
3319 lea r0, [r0 + r1 * 4]
3320 movh [r0 ], m6 ; [26]
3321 movhps [r0 + r1 ], m6 ; [28]
3322 movh [r0 + r1 * 2], m1 ; [30]
3323 movh [r0 + r5 ], m2 ; unweighted samples shifted by one
3324
3325 lea r0, [r6 + 8] ; second pass writes 8 bytes to the right of the initial pointer
3326 add r2, 8 ; advance the sample source by 8 bytes
3327 dec r4
3328 jnz .loop
3329
3330 RET
3331
;-----------------------------------------------------------------------------
; intra_pred_ang16_10 (SSE4): intra prediction mode 10 for a 16x16 block
; (mode and size taken from the symbol name).
; All six arguments are loaded into registers:
;   r0 = output pointer; r1 = stride; r4 = 3 * stride
;   r2 = sample source; bytes [r2 + 1 .. r2 + 16] are read into m0, after
;        which r2 is reused as a running row pointer
;   r3 = second sample source, read only when filtering
;   r5 = filter flag (low 16 bits tested)
; Row k of the output is byte k of m0 replicated 16 times (pshufb with the
; all-zero mask m7 broadcasts byte 0 of its destination).
; When the flag is non-zero, row 0 becomes, per column i,
;   saturate_u8(m0[0] + ((byte [r3 + 1 + i] - byte [r3]) >> 1))
; with an arithmetic shift; otherwise row 0 is the plain broadcast of m0[0].
;-----------------------------------------------------------------------------
3332 INIT_XMM sse4
3333 cglobal intra_pred_ang16_10, 6,6,8
3334 lea r4, [r1 * 3] ; r4 = 3 * stride
3335 pxor m7, m7 ; zero shuffle mask: broadcast byte 0
3336
3337 movu m0, [r2 + 1]
3338 palignr m1, m0, 1 ; byte 0 of m1 = m0[1]; upper bytes are don't-care
3339 pshufb m1, m7
3340 palignr m2, m0, 2
3341 pshufb m2, m7
3342 palignr m3, m0, 3
3343 pshufb m3, m7
3344 palignr m4, m0, 4
3345 pshufb m4, m7
3346 palignr m5, m0, 5
3347 pshufb m5, m7
3348 palignr m6, m0, 6
3349 pshufb m6, m7
3350
3351 movu [r0 + r1], m1 ; rows 1..6 (row 0 is written at .quit)
3352 movu [r0 + r1 * 2], m2
3353 movu [r0 + r4], m3
3354 lea r2, [r0 + r1 * 4] ; r2 now serves as the row pointer
3355 movu [r2], m4
3356 movu [r2 + r1], m5
3357 movu [r2 + r1 * 2], m6
3358
3359 palignr m1, m0, 7
3360 pshufb m1, m7
3361 movhlps m2, m0 ; byte 0 of m2 = m0[8]
3362 pshufb m2, m7
3363 palignr m3, m0, 9
3364 pshufb m3, m7
3365 palignr m4, m0, 10
3366 pshufb m4, m7
3367 palignr m5, m0, 11
3368 pshufb m5, m7
3369 palignr m6, m0, 12
3370 pshufb m6, m7
3371
3372 movu [r2 + r4], m1 ; rows 7..12
3373 lea r2, [r2 + r1 * 4]
3374 movu [r2], m2
3375 movu [r2 + r1], m3
3376 movu [r2 + r1 * 2], m4
3377 movu [r2 + r4], m5
3378 lea r2, [r2 + r1 * 4]
3379 movu [r2], m6
3380
3381 palignr m1, m0, 13
3382 pshufb m1, m7
3383 palignr m2, m0, 14
3384 pshufb m2, m7
3385 palignr m3, m0, 15
3386 pshufb m3, m7
3387 pshufb m0, m7 ; m0 = broadcast of its own byte 0 (row 0 value)
3388
3389 movu [r2 + r1], m1 ; rows 13..15
3390 movu [r2 + r1 * 2], m2
3391 movu [r2 + r4], m3
3392
3393 ; filter
3394 cmp r5w, byte 0
3395 jz .quit
3396 pmovzxbw m0, m0 ; row 0 value widened to words
3397 mova m1, m0
3398 movu m2, [r3]
3399 movu m3, [r3 + 1]
3400
3401 pshufb m2, m7 ; broadcast byte [r3]
3402 pmovzxbw m2, m2
3403 movhlps m4, m3 ; upper 8 bytes of [r3 + 1 ..]
3404 pmovzxbw m3, m3
3405 pmovzxbw m4, m4
3406 psubw m3, m2 ; difference to byte [r3]
3407 psubw m4, m2
3408 psraw m3, 1 ; halve, keeping the sign
3409 psraw m4, 1
3410 paddw m0, m3
3411 paddw m1, m4
3412 packuswb m0, m1 ; back to bytes with unsigned saturation
3413
3414 .quit:
3415 movu [r0], m0 ; row 0
3416
3417 RET
3418
;-----------------------------------------------------------------------------
; intra_pred_ang16_26 (SSE4): intra prediction mode 26 for a 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer; r1 = stride; r4 = 3 * stride
;   r3 = sample source copied to every row (bytes [r3 + 1 .. r3 + 16]); it is
;        then reused, together with r5 and r6, as a row pointer (rows 4, 8, 12)
;   r2 = second sample source, read only when filtering
;   bfilter = filter flag (6th argument). r5 is overwritten by a row pointer,
;        so the flag is kept in r7 on x86-64 and in a 4-byte stack slot on
;        32-bit builds.
; All 16 rows receive the same 16 bytes. When the flag is non-zero, column 0
; is then rewritten one byte per row: for row i,
;   saturate_u8(m0[0] + ((byte [r2 + 1 + i] - byte [r2]) >> 1))
; with an arithmetic shift.
;-----------------------------------------------------------------------------
3419 INIT_XMM sse4
3420 %if ARCH_X86_64 == 1
3421 cglobal intra_pred_ang16_26, 4,8,5
3422 mov r7, r5mp
3423 %define bfilter r7w
3424 %else
3425 cglobal intra_pred_ang16_26, 6,7,5,0 - 4
3426 %define bfilter dword[rsp]
3427 mov bfilter, r5
3428 %endif
3429 movu m0, [r3 + 1]
3430
3431 lea r4, [r1 * 3] ; r4 = 3 * stride
3432 lea r3, [r0 + r1 * 4] ; r3 -> row 4
3433 lea r5, [r3 + r1 * 4] ; r5 -> row 8
3434 lea r6, [r5 + r1 * 4] ; r6 -> row 12
3435
3436 movu [r0], m0
3437 movu [r0 + r1], m0
3438 movu [r0 + r1 * 2], m0
3439 movu [r0 + r4], m0
3440 movu [r3], m0
3441 movu [r3 + r1], m0
3442 movu [r3 + r1 * 2], m0
3443 movu [r3 + r4], m0
3444 movu [r5], m0
3445 movu [r5 + r1], m0
3446 movu [r5 + r1 * 2], m0
3447 movu [r5 + r4], m0
3448
3449 movu [r6], m0
3450 movu [r6 + r1], m0
3451 movu [r6 + r1 * 2], m0
3452 movu [r6 + r4], m0
3453
3454 ; filter
3455 cmp bfilter, byte 0
3456 jz .quit
3457
3458 pxor m4, m4 ; zero shuffle mask: broadcast byte 0
3459 pshufb m0, m4 ; broadcast m0[0]
3460 pmovzxbw m0, m0
3461 mova m1, m0
3462 movu m2, [r2]
3463 movu m3, [r2 + 1]
3464
3465 pshufb m2, m4 ; broadcast byte [r2]
3466 pmovzxbw m2, m2
3467 movhlps m4, m3 ; upper 8 bytes of [r2 + 1 ..]
3468 pmovzxbw m3, m3
3469 pmovzxbw m4, m4
3470 psubw m3, m2 ; difference to byte [r2]
3471 psubw m4, m2
3472 psraw m3, 1 ; halve, keeping the sign
3473 psraw m4, 1
3474 paddw m0, m3
3475 paddw m1, m4
3476 packuswb m0, m1 ; back to bytes with unsigned saturation
3477
3478 pextrb [r0], m0, 0 ; byte i of m0 -> first byte of row i
3479 pextrb [r0 + r1], m0, 1
3480 pextrb [r0 + r1 * 2], m0, 2
3481 pextrb [r0 + r4], m0, 3
3482 pextrb [r3], m0, 4
3483 pextrb [r3 + r1], m0, 5
3484 pextrb [r3 + r1 * 2], m0, 6
3485 pextrb [r3 + r4], m0, 7
3486 pextrb [r5], m0, 8
3487 pextrb [r5 + r1], m0, 9
3488 pextrb [r5 + r1 * 2], m0, 10
3489 pextrb [r5 + r4], m0, 11
3490 pextrb [r6], m0, 12
3491 pextrb [r6 + r1], m0, 13
3492 pextrb [r6 + r1 * 2], m0, 14
3493 pextrb [r6 + r4], m0, 15
3494
3495 .quit:
3496 RET
3497
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 (SSE4): angular intra prediction, mode 11, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = source samples (3rd argument), read from [r2] on; advanced by 8 per
;        pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; A single pair window (m3) feeds 15 weighted vectors per pass; the 16th is
; the unshifted sample vector kept in m2, merged in without weighting.
; NOTE(review): ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live outside this
; chunk. Presumably pw_1024 holds words of 1024, which would make pmulhrsw
; compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
3498 INIT_XMM sse4
3499 cglobal intra_pred_ang16_11, 3,7,8
3500
3501 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
3502 mov r4d, 2 ; two passes
3503 lea r5, [r1 * 3] ; r5 -> 3 * stride
3504 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3505 mova m7, [pw_1024]
3506
3507 .loop:
3508 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3509 mova m2, m3 ; keep the raw samples for the unweighted [00] vector
3510 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3511 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3512
3513 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
3514 pmulhrsw m4, m7
3515 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
3516 pmulhrsw m0, m7
3517 packuswb m4, m0
3518
3519 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
3520 pmulhrsw m5, m7
3521 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3522 pmulhrsw m6, m7
3523 packuswb m5, m6
3524
3525 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
3526 pmulhrsw m6, m7
3527 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
3528 pmulhrsw m0, m7
3529 packuswb m6, m0
3530
3531 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
3532 pmulhrsw m1, m7
3533 pmaddubsw m0, m3, [r3] ; [16]
3534 pmulhrsw m0, m7
3535 packuswb m1, m0
3536
3537 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3538
3539 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
3540 pmulhrsw m4, m7
3541 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
3542 pmulhrsw m5, m7
3543 packuswb m4, m5
3544
3545 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
3546 pmulhrsw m5, m7
3547 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
3548 pmulhrsw m6, m7
3549 packuswb m5, m6
3550
3551 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
3552 pmulhrsw m6, m7
3553 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
3554 pmulhrsw m1, m7
3555 packuswb m6, m1
3556
3557 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
3558 pmulhrsw m1, m7
3559 packuswb m1, m1 ; low 8 bytes = weighted result [02]
3560 punpcklqdq m1, m2 ;[00]
3561
3562 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3563
3564 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
3565 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
3566 add r2, 8 ; advance the sample source by 8 bytes
3567 dec r4
3568 jnz .loop
3569
3570 RET
3571
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 (SSE4): angular intra prediction, mode 25, 16x16 block
; (mode and size taken from the symbol name).
;   r0 = output pointer; r1 = stride; r5 = 3 * stride
;   r2 = source samples, reloaded from the 4th argument and read from [r2] on;
;        advanced by 8 per pass
;   r3 = ang_table + 16 * 16, so [r3 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r4 = pass counter (2); r6 = copy of the initial r0
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; Each pass produces an 8-byte-wide, 16-row strip. The first eight rows go
; through TRANSPOSE_STORE_8x8 0, 0; the last eight are written directly with
; movh/movhps, the final row being the unweighted raw samples kept in m2.
; NOTE(review): the direct stores assume TRANSPOSE_STORE_8x8 0, 0 leaves r0
; pointing at row 8 of the strip - the macro, ang_table and pw_1024 are
; defined outside this chunk; confirm. Presumably pw_1024 holds words of 1024,
; which would make pmulhrsw compute (x + 16) >> 5.
;-----------------------------------------------------------------------------
3572 INIT_XMM sse4
3573 cglobal intra_pred_ang16_25, 3,7,8
3574 mov r2, r3mp ; r2 <- 4th argument; it replaces the 3rd as the sample source
3575 lea r3, [ang_table + 16 * 16] ; r3 -> table entry 16
3576 mov r4d, 2 ; two passes
3577 lea r5, [r1 * 3] ; r5 -> 3 * stride
3578 mov r6, r0 ; keep the initial output pointer for the second pass
3579 mova m7, [pw_1024]
3580
3581 .loop:
3582 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3583 mova m2, m3 ; keep the raw samples for the final, unweighted row
3584 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3585 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3586
3587 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
3588 pmulhrsw m4, m7
3589 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
3590 pmulhrsw m0, m7
3591 packuswb m4, m0
3592
3593 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
3594 pmulhrsw m5, m7
3595 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3596 pmulhrsw m6, m7
3597 packuswb m5, m6
3598
3599 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
3600 pmulhrsw m6, m7
3601 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
3602 pmulhrsw m0, m7
3603 packuswb m6, m0
3604
3605 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
3606 pmulhrsw m1, m7
3607 pmaddubsw m0, m3, [r3] ; [16]
3608 pmulhrsw m0, m7
3609 packuswb m1, m0
3610
3611 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3612
3613 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
3614 pmulhrsw m4, m7
3615 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
3616 pmulhrsw m5, m7
3617 packuswb m4, m5
3618
3619 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
3620 pmulhrsw m5, m7
3621 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
3622 pmulhrsw m6, m7
3623 packuswb m5, m6
3624
3625 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
3626 pmulhrsw m6, m7
3627 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
3628 pmulhrsw m1, m7
3629 packuswb m6, m1
3630
3631 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
3632 pmulhrsw m1, m7
3633 packuswb m1, m1
3634
3635 movh [r0 ], m4 ; [14]
3636 movhps [r0 + r1 ], m4 ; [12]
3637 movh [r0 + r1 * 2], m5 ; [10]
3638 movhps [r0 + r5 ], m5 ; [08]
3639 lea r0, [r0 + r1 * 4]
3640 movh [r0 ], m6 ; [06]
3641 movhps [r0 + r1 ], m6 ; [04]
3642 movh [r0 + r1 * 2], m1 ; [02]
3643 movh [r0 + r5 ], m2 ; unweighted raw samples
3644
3645 lea r0, [r6 + 8] ; second pass writes 8 bytes to the right of the initial pointer
3646 add r2, 8 ; advance the sample source by 8 bytes
3647 dec r4
3648 jnz .loop
3649
3650 RET
3651
;-----------------------------------------------------------------------------
; intra_pred_ang16_12 (SSE4): angular intra prediction, mode 12, 16x16 block
; (mode and size taken from the symbol name).
; Fully unrolled (no loop); four arguments are loaded into registers:
;   r0 = output pointer handed to TRANSPOSE_STORE_8x8; r1 = stride
;   r2 = main sample source, read from [r2] and [r2 + 1]
;   r3 = second sample source; 16 bytes from [r3] are reordered with the
;        c_mode16_12 shuffle into m2, whose top bytes are shifted into the
;        pair window by the palignr m3, m2, 15 / 14 steps
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   r5 = 3 * stride; r6 = r0 + 4 * stride
;   m7 = [pw_1024], multiplier for the pmulhrsw rounding step
; The first two TRANSPOSE_STORE_8x8 calls cover samples 0..7 of the main
; source, the last two (after r0/r6 are moved down) cover samples 8..15.
; NOTE(review): c_mode16_12, ang_table, pw_1024 and TRANSPOSE_STORE_8x8 live
; outside this chunk. Presumably pw_1024 holds words of 1024, which would make
; pmulhrsw compute (x + 16) >> 5 - confirm.
;-----------------------------------------------------------------------------
3652 INIT_XMM sse4
3653 cglobal intra_pred_ang16_12, 4,7,8
3654
3655 lea r4, [ang_table + 16 * 16] ; r4 -> table entry 16
3656 lea r5, [r1 * 3] ; r5 -> 3 * stride
3657 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
3658 mova m7, [pw_1024]
3659
3660 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3661 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3662 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
3663 movu m2, [r3]
3664 pshufb m2, [c_mode16_12] ; reorder the second source (shuffle table defined elsewhere)
3665
3666 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3667
3668 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
3669 pmulhrsw m4, m7
3670 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
3671 pmulhrsw m1, m7
3672 packuswb m4, m1
3673
3674 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
3675 pmulhrsw m5, m7
3676 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
3677 pmulhrsw m6, m7
3678 packuswb m5, m6
3679
3680 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
3681 pmulhrsw m6, m7
3682 pmaddubsw m0, [r4 - 14 * 16] ; [2]
3683 pmulhrsw m0, m7
3684 packuswb m6, m0
3685
3686 palignr m3, m2, 15 ; pair window one sample lower: m2[15] enters at the bottom
3687
3688 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3689 pmulhrsw m1, m7
3690 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3691 pmulhrsw m0, m7
3692 packuswb m1, m0
3693
3694 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3695
3696 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3697 pmulhrsw m4, m7
3698 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3699 pmulhrsw m5, m7
3700 packuswb m4, m5
3701
3702 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3703 pmulhrsw m5, m7
3704 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3705 pmulhrsw m6, m7
3706 packuswb m5, m6
3707
3708 palignr m3, m2, 14 ; one more sample lower: m2[14], m2[15] enter at the bottom
3709
3710 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3711 pmulhrsw m6, m7
3712 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3713 pmulhrsw m1, m7
3714 packuswb m6, m1
3715
3716 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3717 pmulhrsw m1, m7
3718 pmaddubsw m3, [r4] ; [16]
3719 pmulhrsw m3, m7
3720 packuswb m1, m3
3721
3722 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3723
3724 lea r0, [r6 + r1 * 4] ; r0 = previous r6 + 4 * stride
3725 lea r6, [r6 + r1 * 8] ; r6 moves down 8 rows
3726
3727 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3728 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
3729 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
3730 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
3731
3732 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
3733 pmulhrsw m4, m7
3734 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
3735 pmulhrsw m5, m7
3736 packuswb m4, m5
3737
3738 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
3739 pmulhrsw m5, m7
3740 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
3741 pmulhrsw m6, m7
3742 packuswb m5, m6
3743
3744 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
3745 pmulhrsw m6, m7
3746 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
3747 pmulhrsw m0, m7
3748 packuswb m6, m0
3749
3750 palignr m3, m2, 14 ; pair window one sample lower: samples 7, 8 enter at the bottom
3751
3752 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3753 pmulhrsw m1, m7
3754 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3755 pmulhrsw m0, m7
3756 packuswb m1, m0
3757
3758 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3759
3760 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3761 pmulhrsw m4, m7
3762 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3763 pmulhrsw m5, m7
3764 packuswb m4, m5
3765
3766 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3767 pmulhrsw m5, m7
3768 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3769 pmulhrsw m6, m7
3770 packuswb m5, m6
3771
3772 pslldq m2, 1 ; move samples 6, 7 into the top two bytes of m2
3773 palignr m3, m2, 14 ; one more sample lower: samples 6, 7 enter at the bottom
3774
3775 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3776 pmulhrsw m6, m7
3777 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3778 pmulhrsw m1, m7
3779 packuswb m6, m1
3780
3781 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3782 pmulhrsw m1, m7
3783 pmaddubsw m3, [r4] ; [16]
3784 pmulhrsw m3, m7
3785 packuswb m1, m3
3786
3787 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
3788
3789 RET
3790
;-----------------------------------------------------------------------------
; intra_pred_ang16_24 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (24).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 is set to 3 * stride)
;   r3 - main reference samples for this mode
;   r2 - side reference; the bytes selected by c_mode16_12 extend the main
;        reference below sample 0
; Each output line blends neighbouring reference byte pairs with one 16-byte
; ang_table entry (pmaddubsw), is scaled by m7 with pmulhrsw (pw_1024 is
; presumably words of 1024, i.e. (x + 16) >> 5 - confirm the constant) and is
; packed to bytes. The bracketed number after each pmaddubsw is the ang_table
; entry used; in this function it steps by -5 modulo 32, and every wrap-around
; is preceded by a palignr that slides the sample window down by one sample.
; Work is split into two halves of 16 lines x 8 pixels; each half issues
; TRANSPOSE_STORE_8x8 twice (first argument 0, then 1; second argument is 0
; throughout this function).
;-----------------------------------------------------------------------------
3791 INIT_XMM sse4
3792 cglobal intra_pred_ang16_24, 4,7,8
3793
3794 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
3795 lea r5, [r1 * 3] ; r5 -> 3 * stride
3796 mov r6, r0 ; remember the initial r0 for the second half
3797 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
3798
3799 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3800 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3801 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
3802 movu m2, [r2] ; side reference
3803 pshufb m2, [c_mode16_12] ; keep only the side bytes selected by c_mode16_12
3804
3805 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3806
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
3807 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
3808 pmulhrsw m4, m7
3809 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
3810 pmulhrsw m1, m7
3811 packuswb m4, m1
3812
3813 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
3814 pmulhrsw m5, m7
3815 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
3816 pmulhrsw m6, m7
3817 packuswb m5, m6
3818
3819 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
3820 pmulhrsw m6, m7
3821 pmaddubsw m0, [r4 - 14 * 16] ; [2]
3822 pmulhrsw m0, m7
3823 packuswb m6, m0
3824
3825 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
3826
3827 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3828 pmulhrsw m1, m7
3829 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3830 pmulhrsw m0, m7
3831 packuswb m1, m0
3832
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
3833 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3834
; ---- first half, lines 8-15 ----
3835 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3836 pmulhrsw m4, m7
3837 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3838 pmulhrsw m5, m7
3839 packuswb m4, m5
3840
3841 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3842 pmulhrsw m5, m7
3843 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3844 pmulhrsw m6, m7
3845 packuswb m5, m6
3846
3847 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
3848
3849 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3850 pmulhrsw m6, m7
3851 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3852 pmulhrsw m1, m7
3853 packuswb m6, m1
3854
3855 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3856 pmulhrsw m1, m7
3857 pmaddubsw m3, [r4] ; [16]
3858 pmulhrsw m3, m7
3859 packuswb m1, m3
3860
3861 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3862
; ---- second half: output pointer = initial r0 + 8 bytes, window starts at pair (8,9) ----
3863 lea r0, [r6 + 8]
3864
3865 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3866 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
3867 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
3868 movlhps m2, m1 ; high qword of m2 <- samples 8..1; low qword of m2 is kept
3869
3870 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
3871 pmulhrsw m4, m7
3872 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
3873 pmulhrsw m5, m7
3874 packuswb m4, m5
3875
3876 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
3877 pmulhrsw m5, m7
3878 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
3879 pmulhrsw m6, m7
3880 packuswb m5, m6
3881
3882 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
3883 pmulhrsw m6, m7
3884 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
3885 pmulhrsw m0, m7
3886 packuswb m6, m0
3887
3888 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
3889
3890 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3891 pmulhrsw m1, m7
3892 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3893 pmulhrsw m0, m7
3894 packuswb m1, m0
3895
3896 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3897
3898 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3899 pmulhrsw m4, m7
3900 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3901 pmulhrsw m5, m7
3902 packuswb m4, m5
3903
3904 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3905 pmulhrsw m5, m7
3906 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3907 pmulhrsw m6, m7
3908 packuswb m5, m6
3909
3910 pslldq m2, 1 ; bring the next lower sample to the top of m2
3911 palignr m3, m2, 14 ; slide the window down by one sample
3912
3913 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3914 pmulhrsw m6, m7
3915 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3916 pmulhrsw m1, m7
3917 packuswb m6, m1
3918
3919 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3920 pmulhrsw m1, m7
3921 pmaddubsw m3, [r4] ; [16]
3922 pmulhrsw m3, m7
3923 packuswb m1, m3
3924
3925 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
3926
3927 RET
3928
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (13).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 = 3 * stride, r6 = r0 + 4 * stride)
;   r2 - main reference samples for this mode
;   r3 - side reference; the bytes selected by c_mode16_13 extend the main
;        reference below sample 0
; Each output line blends neighbouring reference byte pairs with one 16-byte
; ang_table entry (pmaddubsw), is scaled by m7 with pmulhrsw (pw_1024 is
; presumably words of 1024, i.e. (x + 16) >> 5 - confirm the constant) and is
; packed to bytes. The bracketed number after each pmaddubsw is the ang_table
; entry used; in this function it steps by -9 modulo 32, and every wrap-around
; is preceded by a palignr that slides the sample window down by one sample.
; Work is split into two halves of 16 lines x 8 pixels; each half issues
; TRANSPOSE_STORE_8x8 twice (first argument 0, then 1; second argument is 1
; throughout this function).
;-----------------------------------------------------------------------------
3929 INIT_XMM sse4
3930 cglobal intra_pred_ang16_13, 4,7,8
3931
3932 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
3933 lea r5, [r1 * 3] ; r5 -> 3 * stride
3934 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride
3935 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
3936
3937 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3938 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3939 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
3940 movu m2, [r3] ; side reference
3941 pshufb m2, [c_mode16_13] ; keep only the side bytes selected by c_mode16_13
3942
3943 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3944
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
3945 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
3946 pmulhrsw m4, m7
3947 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
3948 pmulhrsw m0, m7
3949 packuswb m4, m0
3950
3951 pmaddubsw m5, [r4 - 11 * 16] ; [05]
3952 pmulhrsw m5, m7
3953
3954 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
3955
3956 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
3957 pmulhrsw m6, m7
3958 packuswb m5, m6
3959
3960 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
3961 pmulhrsw m6, m7
3962 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
3963 pmulhrsw m0, m7
3964 packuswb m6, m0
3965
3966 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
3967 pmulhrsw m1, m7
3968
3969 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
3970
3971 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
3972 pmulhrsw m0, m7
3973 packuswb m1, m0
3974
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
3975 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3976
; ---- first half, lines 8-15 ----
3977 pmaddubsw m4, m3, [r4 - 16] ; [15]
3978 pmulhrsw m4, m7
3979 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
3980 pmulhrsw m5, m7
3981 packuswb m4, m5
3982
3983 pslldq m2, 1 ; bring the next side byte to the top of m2
3984 palignr m3, m2, 14 ; slide the window down by one sample
3985
3986 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
3987 pmulhrsw m5, m7
3988 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
3989 pmulhrsw m6, m7
3990 packuswb m5, m6
3991
3992 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
3993 pmulhrsw m6, m7
3994 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
3995 pmulhrsw m1, m7
3996 packuswb m6, m1
3997
3998 pslldq m2, 1
3999 palignr m3, m2, 14
4000
4001 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4002 pmulhrsw m1, m7
4003 pmaddubsw m3, [r4] ; [16]
4004 pmulhrsw m3, m7
4005 packuswb m1, m3
4006
4007 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4008
; ---- second half: r0/r6 move on by stride multiples, window starts at pair (8,9) ----
; NOTE(review): r0 = initial r0 + 8 * stride only if the macro leaves r6 untouched - confirm.
4009 lea r0, [r6 + r1 * 4]
4010 lea r6, [r6 + r1 * 8]
4011
4012 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4013 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4014 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4015 movlhps m2, m1 ; high qword of m2 <- samples 8..1; low qword of m2 is kept
4016
4017 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
4018 pmulhrsw m4, m7
4019 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
4020 pmulhrsw m5, m7
4021 packuswb m4, m5
4022
4023 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4024 pmulhrsw m5, m7
4025
4026 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
4027
4028 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4029 pmulhrsw m6, m7
4030 packuswb m5, m6
4031
4032 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4033 pmulhrsw m6, m7
4034 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4035 pmulhrsw m0, m7
4036 packuswb m6, m0
4037
4038 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4039 pmulhrsw m1, m7
4040
4041 pslldq m2, 1
4042 palignr m3, m2, 14
4043
4044 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4045 pmulhrsw m0, m7
4046 packuswb m1, m0
4047
4048 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4049
4050 pmaddubsw m4, m3, [r4 - 16] ; [15]
4051 pmulhrsw m4, m7
4052 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4053 pmulhrsw m5, m7
4054 packuswb m4, m5
4055
4056 pslldq m2, 1
4057 palignr m3, m2, 14
4058
4059 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4060 pmulhrsw m5, m7
4061 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4062 pmulhrsw m6, m7
4063 packuswb m5, m6
4064
4065 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4066 pmulhrsw m6, m7
4067 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4068 pmulhrsw m1, m7
4069 packuswb m6, m1
4070
4071 pslldq m2, 1
4072 palignr m3, m2, 14
4073
4074 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4075 pmulhrsw m1, m7
4076 pmaddubsw m3, [r4] ; [16]
4077 pmulhrsw m3, m7
4078 packuswb m1, m3
4079
4080 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4081
4082 RET
4083
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (23).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 is set to 3 * stride)
;   r3 - main reference samples for this mode
;   r2 - side reference; the bytes selected by c_mode16_13 extend the main
;        reference below sample 0
; The arithmetic is the same sequence as intra_pred_ang16_13 with r2 and r3
; exchanged: ang_table entries step by -9 modulo 32 (see the bracketed
; numbers), and every wrap-around is preceded by a palignr that slides the
; sample window down by one sample. Each line is pmaddubsw with the table
; entry, pmulhrsw with m7 (pw_1024, presumably words of 1024 - confirm), then
; packuswb. TRANSPOSE_STORE_8x8 is issued twice per half with second argument
; 0, and the second half starts 8 bytes after the initial r0.
;-----------------------------------------------------------------------------
4084 INIT_XMM sse4
4085 cglobal intra_pred_ang16_23, 4,7,8
4086
4087 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
4088 lea r5, [r1 * 3] ; r5 -> 3 * stride
4089 mov r6, r0 ; remember the initial r0 for the second half
4090 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
4091
4092 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4093 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4094 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4095 movu m2, [r2] ; side reference
4096 pshufb m2, [c_mode16_13] ; keep only the side bytes selected by c_mode16_13
4097
4098 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4099
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
4100 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
4101 pmulhrsw m4, m7
4102 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
4103 pmulhrsw m0, m7
4104 packuswb m4, m0
4105
4106 pmaddubsw m5, [r4 - 11 * 16] ; [05]
4107 pmulhrsw m5, m7
4108
4109 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
4110
4111 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4112 pmulhrsw m6, m7
4113 packuswb m5, m6
4114
4115 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4116 pmulhrsw m6, m7
4117 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4118 pmulhrsw m0, m7
4119 packuswb m6, m0
4120
4121 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4122 pmulhrsw m1, m7
4123
4124 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
4125
4126 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4127 pmulhrsw m0, m7
4128 packuswb m1, m0
4129
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
4130 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4131
; ---- first half, lines 8-15 ----
4132 pmaddubsw m4, m3, [r4 - 16] ; [15]
4133 pmulhrsw m4, m7
4134 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4135 pmulhrsw m5, m7
4136 packuswb m4, m5
4137
4138 pslldq m2, 1 ; bring the next side byte to the top of m2
4139 palignr m3, m2, 14 ; slide the window down by one sample
4140
4141 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4142 pmulhrsw m5, m7
4143 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4144 pmulhrsw m6, m7
4145 packuswb m5, m6
4146
4147 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4148 pmulhrsw m6, m7
4149 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4150 pmulhrsw m1, m7
4151 packuswb m6, m1
4152
4153 pslldq m2, 1
4154 palignr m3, m2, 14
4155
4156 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4157 pmulhrsw m1, m7
4158 pmaddubsw m3, [r4] ; [16]
4159 pmulhrsw m3, m7
4160 packuswb m1, m3
4161
4162 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4163
; ---- second half: output pointer = initial r0 + 8 bytes, window starts at pair (8,9) ----
4164 lea r0, [r6 + 8]
4165
4166 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4167 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4168 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4169 movlhps m2, m1 ; high qword of m2 <- samples 8..1; low qword of m2 is kept
4170
4171 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
4172 pmulhrsw m4, m7
4173 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
4174 pmulhrsw m5, m7
4175 packuswb m4, m5
4176
4177 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4178 pmulhrsw m5, m7
4179
4180 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
4181
4182 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4183 pmulhrsw m6, m7
4184 packuswb m5, m6
4185
4186 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4187 pmulhrsw m6, m7
4188 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4189 pmulhrsw m0, m7
4190 packuswb m6, m0
4191
4192 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4193 pmulhrsw m1, m7
4194
4195 pslldq m2, 1
4196 palignr m3, m2, 14
4197
4198 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4199 pmulhrsw m0, m7
4200 packuswb m1, m0
4201
4202 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4203
4204 pmaddubsw m4, m3, [r4 - 16] ; [15]
4205 pmulhrsw m4, m7
4206 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4207 pmulhrsw m5, m7
4208 packuswb m4, m5
4209
4210 pslldq m2, 1
4211 palignr m3, m2, 14
4212
4213 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4214 pmulhrsw m5, m7
4215 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4216 pmulhrsw m6, m7
4217 packuswb m5, m6
4218
4219 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4220 pmulhrsw m6, m7
4221 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4222 pmulhrsw m1, m7
4223 packuswb m6, m1
4224
4225 pslldq m2, 1
4226 palignr m3, m2, 14
4227
4228 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4229 pmulhrsw m1, m7
4230 pmaddubsw m3, [r4] ; [16]
4231 pmulhrsw m3, m7
4232 packuswb m1, m3
4233
4234 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4235
4236 RET
4237
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (14).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 = 3 * stride, r6 = r0 + 4 * stride)
;   r2 - main reference samples for this mode
;   r3 - side reference; the bytes selected by c_mode16_14 extend the main
;        reference below sample 0
; Each output line blends neighbouring reference byte pairs with one 16-byte
; ang_table entry (pmaddubsw), is scaled by m7 with pmulhrsw (pw_1024 is
; presumably words of 1024, i.e. (x + 16) >> 5 - confirm the constant) and is
; packed to bytes. The bracketed number after each pmaddubsw is the ang_table
; entry used; in this function it steps by -13 modulo 32, and every
; wrap-around is preceded by a palignr that slides the sample window down by
; one sample.
; Work is split into two halves of 16 lines x 8 pixels; each half issues
; TRANSPOSE_STORE_8x8 twice (first argument 0, then 1; second argument is 1
; throughout this function).
;-----------------------------------------------------------------------------
4238 INIT_XMM sse4
4239 cglobal intra_pred_ang16_14, 4,7,8
4240
4241 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
4242 lea r5, [r1 * 3] ; r5 -> 3 * stride
4243 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride
4244 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
4245
4246 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4247 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4248 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4249 movu m2, [r3] ; side reference
4250 pshufb m2, [c_mode16_14] ; keep only the side bytes selected by c_mode16_14
4251
4252 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4253
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
4254 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
4255 pmulhrsw m4, m7
4256 pmaddubsw m5, [r4 - 10 * 16] ; [06]
4257 pmulhrsw m5, m7
4258 packuswb m4, m5
4259
4260 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
4261
4262 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4263 pmulhrsw m5, m7
4264 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4265 pmulhrsw m6, m7
4266 packuswb m5, m6
4267
4268 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
4269
4270 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4271 pmulhrsw m6, m7
4272 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4273 pmulhrsw m0, m7
4274 packuswb m6, m0
4275
4276 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4277 pmulhrsw m1, m7
4278
4279 pslldq m2, 1 ; bring the next side byte to the top of m2
4280 palignr m3, m2, 14 ; slide the window down by one sample
4281
4282 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4283 pmulhrsw m0, m7
4284 packuswb m1, m0
4285
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
4286 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4287
; ---- first half, lines 8-15 ----
4288 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4289 pmulhrsw m4, m7
4290
4291 pslldq m2, 1
4292 palignr m3, m2, 14
4293
4294 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4295 pmulhrsw m5, m7
4296 packuswb m4, m5
4297
4298 pmaddubsw m5, m3, [r4 + 16] ; [17]
4299 pmulhrsw m5, m7
4300 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4301 pmulhrsw m6, m7
4302 packuswb m5, m6
4303
4304 pslldq m2, 1
4305 palignr m3, m2, 14
4306
4307 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4308 pmulhrsw m6, m7
4309 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4310 pmulhrsw m1, m7
4311 packuswb m6, m1
4312
4313 pslldq m2, 1
4314 palignr m3, m2, 14
4315
4316 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4317 pmulhrsw m1, m7
4318 pmaddubsw m3, [r4] ; [16]
4319 pmulhrsw m3, m7
4320 packuswb m1, m3
4321
4322 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4323
; ---- second half: r0/r6 move on by stride multiples, window starts at pair (8,9) ----
; NOTE(review): r0 = initial r0 + 8 * stride only if the macro leaves r6 untouched - confirm.
4324 lea r0, [r6 + r1 * 4]
4325 lea r6, [r6 + r1 * 8]
4326
4327 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4328 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4329 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4330 movlhps m2, m1 ; high qword of m2 <- samples 8..1; low qword of m2 is kept
4331
4332 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
4333 pmulhrsw m4, m7
4334 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4335 pmulhrsw m5, m7
4336 packuswb m4, m5
4337
4338 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
4339
4340 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4341 pmulhrsw m5, m7
4342 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4343 pmulhrsw m6, m7
4344 packuswb m5, m6
4345
4346 pslldq m2, 1
4347 palignr m3, m2, 14
4348
4349 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4350 pmulhrsw m6, m7
4351 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4352 pmulhrsw m0, m7
4353 packuswb m6, m0
4354
4355 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4356 pmulhrsw m1, m7
4357
4358 pslldq m2, 1
4359 palignr m3, m2, 14
4360
4361 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4362 pmulhrsw m0, m7
4363 packuswb m1, m0
4364
4365 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4366
4367 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4368 pmulhrsw m4, m7
4369
4370 pslldq m2, 1
4371 palignr m3, m2, 14
4372
4373 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4374 pmulhrsw m5, m7
4375 packuswb m4, m5
4376
4377 pmaddubsw m5, m3, [r4 + 16] ; [17]
4378 pmulhrsw m5, m7
4379 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4380 pmulhrsw m6, m7
4381 packuswb m5, m6
4382
4383 pslldq m2, 1
4384 palignr m3, m2, 14
4385
4386 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4387 pmulhrsw m6, m7
4388 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4389 pmulhrsw m1, m7
4390 packuswb m6, m1
4391
4392 pslldq m2, 1
4393 palignr m3, m2, 14
4394
4395 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4396 pmulhrsw m1, m7
4397 pmaddubsw m3, [r4] ; [16]
4398 pmulhrsw m3, m7
4399 packuswb m1, m3
4400
4401 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4402
4403 RET
4404
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (22).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 is set to 3 * stride)
;   r3 - main reference samples for this mode
;   r2 - side reference; the bytes selected by c_mode16_14 extend the main
;        reference below sample 0
; The arithmetic is the same sequence as intra_pred_ang16_14 with r2 and r3
; exchanged: ang_table entries step by -13 modulo 32 (see the bracketed
; numbers), and every wrap-around is preceded by a palignr that slides the
; sample window down by one sample. Each line is pmaddubsw with the table
; entry, pmulhrsw with m7 (pw_1024, presumably words of 1024 - confirm), then
; packuswb. TRANSPOSE_STORE_8x8 is issued twice per half with second argument
; 0, and the second half starts 8 bytes after the initial r0.
;-----------------------------------------------------------------------------
4405 INIT_XMM sse4
4406 cglobal intra_pred_ang16_22, 4,7,8
4407
4408 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
4409 lea r5, [r1 * 3] ; r5 -> 3 * stride
4410 mov r6, r0 ; remember the initial r0 for the second half
4411 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
4412
4413 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4414 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4415 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4416 movu m2, [r2] ; side reference
4417 pshufb m2, [c_mode16_14] ; keep only the side bytes selected by c_mode16_14
4418
4419 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4420
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
4421 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
4422 pmulhrsw m4, m7
4423 pmaddubsw m5, [r4 - 10 * 16] ; [06]
4424 pmulhrsw m5, m7
4425 packuswb m4, m5
4426
4427 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
4428
4429 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4430 pmulhrsw m5, m7
4431 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4432 pmulhrsw m6, m7
4433 packuswb m5, m6
4434
4435 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
4436
4437 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4438 pmulhrsw m6, m7
4439 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4440 pmulhrsw m0, m7
4441 packuswb m6, m0
4442
4443 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4444 pmulhrsw m1, m7
4445
4446 pslldq m2, 1 ; bring the next side byte to the top of m2
4447 palignr m3, m2, 14 ; slide the window down by one sample
4448
4449 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4450 pmulhrsw m0, m7
4451 packuswb m1, m0
4452
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
4453 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4454
; ---- first half, lines 8-15 ----
4455 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4456 pmulhrsw m4, m7
4457
4458 pslldq m2, 1
4459 palignr m3, m2, 14
4460
4461 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4462 pmulhrsw m5, m7
4463 packuswb m4, m5
4464
4465 pmaddubsw m5, m3, [r4 + 16] ; [17]
4466 pmulhrsw m5, m7
4467 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4468 pmulhrsw m6, m7
4469 packuswb m5, m6
4470
4471 pslldq m2, 1
4472 palignr m3, m2, 14
4473
4474 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4475 pmulhrsw m6, m7
4476 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4477 pmulhrsw m1, m7
4478 packuswb m6, m1
4479
4480 pslldq m2, 1
4481 palignr m3, m2, 14
4482
4483 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4484 pmulhrsw m1, m7
4485 pmaddubsw m3, [r4] ; [16]
4486 pmulhrsw m3, m7
4487 packuswb m1, m3
4488
4489 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4490
; ---- second half: output pointer = initial r0 + 8 bytes, window starts at pair (8,9) ----
4491 lea r0, [r6 + 8]
4492
4493 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4494 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4495 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4496 movlhps m2, m1 ; high qword of m2 <- samples 8..1; low qword of m2 is kept
4497
4498 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
4499 pmulhrsw m4, m7
4500 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4501 pmulhrsw m5, m7
4502 packuswb m4, m5
4503
4504 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
4505
4506 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4507 pmulhrsw m5, m7
4508 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4509 pmulhrsw m6, m7
4510 packuswb m5, m6
4511
4512 pslldq m2, 1
4513 palignr m3, m2, 14
4514
4515 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4516 pmulhrsw m6, m7
4517 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4518 pmulhrsw m0, m7
4519 packuswb m6, m0
4520
4521 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4522 pmulhrsw m1, m7
4523
4524 pslldq m2, 1
4525 palignr m3, m2, 14
4526
4527 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4528 pmulhrsw m0, m7
4529 packuswb m1, m0
4530
4531 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4532
4533 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4534 pmulhrsw m4, m7
4535
4536 pslldq m2, 1
4537 palignr m3, m2, 14
4538
4539 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4540 pmulhrsw m5, m7
4541 packuswb m4, m5
4542
4543 pmaddubsw m5, m3, [r4 + 16] ; [17]
4544 pmulhrsw m5, m7
4545 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4546 pmulhrsw m6, m7
4547 packuswb m5, m6
4548
4549 pslldq m2, 1
4550 palignr m3, m2, 14
4551
4552 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4553 pmulhrsw m6, m7
4554 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4555 pmulhrsw m1, m7
4556 packuswb m6, m1
4557
4558 pslldq m2, 1
4559 palignr m3, m2, 14
4560
4561 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4562 pmulhrsw m1, m7
4563 pmaddubsw m3, [r4] ; [16]
4564 pmulhrsw m3, m7
4565 packuswb m1, m3
4566
4567 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4568
4569 RET
4570
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (15).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 = 3 * stride, r6 = r0 + 4 * stride)
;   r2 - main reference samples for this mode
;   r3 - side reference; the bytes selected by c_mode16_15 extend the main
;        reference below sample 0
; Each output line blends neighbouring reference byte pairs with one 16-byte
; ang_table entry (pmaddubsw), is scaled by m7 with pmulhrsw (pw_1024 is
; presumably words of 1024, i.e. (x + 16) >> 5 - confirm the constant) and is
; packed to bytes. The bracketed number after each pmaddubsw is the ang_table
; entry used; in this function it steps by -17 modulo 32, and every
; wrap-around is preceded by a palignr that slides the sample window down by
; one sample.
; Work is split into two halves of 16 lines x 8 pixels; each half issues
; TRANSPOSE_STORE_8x8 twice (first argument 0, then 1; second argument is 1
; throughout this function).
;-----------------------------------------------------------------------------
4571 INIT_XMM sse4
4572 cglobal intra_pred_ang16_15, 4,7,8
4573
4574 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
4575 lea r5, [r1 * 3] ; r5 -> 3 * stride
4576 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride
4577 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
4578
4579 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4580 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4581 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4582 movu m2, [r3] ; side reference
4583 pshufb m2, [c_mode16_15] ; keep only the side bytes selected by c_mode16_15
4584
4585 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4586
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
4587 pmaddubsw m4, [r4 - 16] ; [15]
4588 pmulhrsw m4, m7
4589
4590 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
4591
4592 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4593 pmulhrsw m5, m7
4594 packuswb m4, m5
4595
4596 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4597 pmulhrsw m5, m7
4598
4599 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
4600
4601 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4602 pmulhrsw m6, m7
4603 packuswb m5, m6
4604
4605 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4606 pmulhrsw m6, m7
4607
4608 pslldq m2, 1 ; bring the next side byte to the top of m2
4609 palignr m3, m2, 14 ; slide the window down by one sample
4610
4611 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4612 pmulhrsw m0, m7
4613 packuswb m6, m0
4614
4615 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4616 pmulhrsw m1, m7
4617
4618 pslldq m2, 1
4619 palignr m3, m2, 14
4620
4621 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4622 pmulhrsw m0, m7
4623 packuswb m1, m0
4624
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
4625 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4626
; ---- first half, lines 8-15 ----
4627 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4628 pmulhrsw m4, m7
4629
4630 pslldq m2, 1
4631 palignr m3, m2, 14
4632
4633 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4634 pmulhrsw m5, m7
4635 packuswb m4, m5
4636
4637 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4638 pmulhrsw m5, m7
4639
4640 pslldq m2, 1
4641 palignr m3, m2, 14
4642
4643 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4644 pmulhrsw m6, m7
4645 packuswb m5, m6
4646
4647 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4648 pmulhrsw m6, m7
4649
4650 pslldq m2, 1
4651 palignr m3, m2, 14
4652
4653 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4654 pmulhrsw m1, m7
4655 packuswb m6, m1
4656
4657 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4658 pmulhrsw m1, m7
4659
4660 pslldq m2, 1
4661 palignr m3, m2, 14
4662
4663 pmaddubsw m3, [r4] ; [16]
4664 pmulhrsw m3, m7
4665 packuswb m1, m3
4666
4667 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4668
; ---- second half: r0/r6 move on by stride multiples, window starts at pair (8,9) ----
; NOTE(review): r0 = initial r0 + 8 * stride only if the macro leaves r6 untouched - confirm.
4669 lea r0, [r6 + r1 * 4]
4670 lea r6, [r6 + r1 * 8]
4671
4672 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4673 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4674 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4675 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
4676
4677 pmaddubsw m4, m3, [r4 - 16] ; [15]
4678 pmulhrsw m4, m7
4679
4680 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
4681
4682 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4683 pmulhrsw m5, m7
4684 packuswb m4, m5
4685
4686 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4687 pmulhrsw m5, m7
4688
4689 pslldq m2, 1
4690 palignr m3, m2, 14
4691
4692 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4693 pmulhrsw m6, m7
4694 packuswb m5, m6
4695
4696 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4697 pmulhrsw m6, m7
4698
4699 pslldq m2, 1
4700 palignr m3, m2, 14
4701
4702 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4703 pmulhrsw m0, m7
4704 packuswb m6, m0
4705
4706 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4707 pmulhrsw m1, m7
4708
4709 pslldq m2, 1
4710 palignr m3, m2, 14
4711
4712 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4713 pmulhrsw m0, m7
4714 packuswb m1, m0
4715
4716 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4717
4718 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4719 pmulhrsw m4, m7
4720
4721 pslldq m2, 1
4722 palignr m3, m2, 14
4723
4724 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4725 pmulhrsw m5, m7
4726 packuswb m4, m5
4727
4728 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4729 pmulhrsw m5, m7
4730
4731 pslldq m2, 1
4732 palignr m3, m2, 14
4733
4734 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4735 pmulhrsw m6, m7
4736 packuswb m5, m6
4737
4738 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4739 pmulhrsw m6, m7
4740
4741 pslldq m2, 1
4742 palignr m3, m2, 14
4743
4744 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4745 pmulhrsw m1, m7
4746 packuswb m6, m1
4747
4748 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4749 pmulhrsw m1, m7
4750
4751 pslldq m2, 1
4752 palignr m3, m2, 14
4753
4754 pmaddubsw m3, [r4] ; [16]
4755 pmulhrsw m3, m7
4756 packuswb m1, m3
4757
4758 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4759
4760 RET
4761
;-----------------------------------------------------------------------------
; intra_pred_ang16_21 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (21).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 is set to 3 * stride)
;   r3 - main reference samples for this mode
;   r2 - side reference; the bytes selected by c_mode16_15 extend the main
;        reference below sample 0
; The arithmetic is the same sequence as intra_pred_ang16_15 with r2 and r3
; exchanged: ang_table entries step by -17 modulo 32 (see the bracketed
; numbers), and every wrap-around is preceded by a palignr that slides the
; sample window down by one sample. Each line is pmaddubsw with the table
; entry, pmulhrsw with m7 (pw_1024, presumably words of 1024 - confirm), then
; packuswb. TRANSPOSE_STORE_8x8 is issued twice per half with second argument
; 0, and the second half starts 8 bytes after the initial r0.
;-----------------------------------------------------------------------------
4762 INIT_XMM sse4
4763 cglobal intra_pred_ang16_21, 4,7,8
4764
4765 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
4766 lea r5, [r1 * 3] ; r5 -> 3 * stride
4767 mov r6, r0 ; remember the initial r0 for the second half
4768 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
4769
4770 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4771 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4772 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4773 movu m2, [r2] ; side reference
4774 pshufb m2, [c_mode16_15] ; keep only the side bytes selected by c_mode16_15
4775
4776 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4777
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
4778 pmaddubsw m4, [r4 - 16] ; [15]
4779 pmulhrsw m4, m7
4780
4781 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
4782
4783 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4784 pmulhrsw m5, m7
4785 packuswb m4, m5
4786
4787 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4788 pmulhrsw m5, m7
4789
4790 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
4791
4792 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4793 pmulhrsw m6, m7
4794 packuswb m5, m6
4795
4796 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4797 pmulhrsw m6, m7
4798
4799 pslldq m2, 1 ; bring the next side byte to the top of m2
4800 palignr m3, m2, 14 ; slide the window down by one sample
4801
4802 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4803 pmulhrsw m0, m7
4804 packuswb m6, m0
4805
4806 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4807 pmulhrsw m1, m7
4808
4809 pslldq m2, 1
4810 palignr m3, m2, 14
4811
4812 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4813 pmulhrsw m0, m7
4814 packuswb m1, m0
4815
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
4816 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4817
; ---- first half, lines 8-15 ----
4818 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4819 pmulhrsw m4, m7
4820
4821 pslldq m2, 1
4822 palignr m3, m2, 14
4823
4824 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4825 pmulhrsw m5, m7
4826 packuswb m4, m5
4827
4828 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4829 pmulhrsw m5, m7
4830
4831 pslldq m2, 1
4832 palignr m3, m2, 14
4833
4834 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4835 pmulhrsw m6, m7
4836 packuswb m5, m6
4837
4838 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4839 pmulhrsw m6, m7
4840
4841 pslldq m2, 1
4842 palignr m3, m2, 14
4843
4844 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4845 pmulhrsw m1, m7
4846 packuswb m6, m1
4847
4848 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4849 pmulhrsw m1, m7
4850
4851 pslldq m2, 1
4852 palignr m3, m2, 14
4853
4854 pmaddubsw m3, [r4] ; [16]
4855 pmulhrsw m3, m7
4856 packuswb m1, m3
4857
4858 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4859
; ---- second half: output pointer = initial r0 + 8 bytes, window starts at pair (8,9) ----
4860 lea r0, [r6 + 8]
4861
4862 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4863 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4864 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4865 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
4866
4867 pmaddubsw m4, m3, [r4 - 16] ; [15]
4868 pmulhrsw m4, m7
4869
4870 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
4871
4872 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4873 pmulhrsw m5, m7
4874 packuswb m4, m5
4875
4876 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4877 pmulhrsw m5, m7
4878
4879 pslldq m2, 1
4880 palignr m3, m2, 14
4881
4882 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4883 pmulhrsw m6, m7
4884 packuswb m5, m6
4885
4886 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4887 pmulhrsw m6, m7
4888
4889 pslldq m2, 1
4890 palignr m3, m2, 14
4891
4892 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4893 pmulhrsw m0, m7
4894 packuswb m6, m0
4895
4896 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4897 pmulhrsw m1, m7
4898
4899 pslldq m2, 1
4900 palignr m3, m2, 14
4901
4902 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4903 pmulhrsw m0, m7
4904 packuswb m1, m0
4905
4906 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4907
4908 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4909 pmulhrsw m4, m7
4910
4911 pslldq m2, 1
4912 palignr m3, m2, 14
4913
4914 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4915 pmulhrsw m5, m7
4916 packuswb m4, m5
4917
4918 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4919 pmulhrsw m5, m7
4920
4921 pslldq m2, 1
4922 palignr m3, m2, 14
4923
4924 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4925 pmulhrsw m6, m7
4926 packuswb m5, m6
4927
4928 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4929 pmulhrsw m6, m7
4930
4931 pslldq m2, 1
4932 palignr m3, m2, 14
4933
4934 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4935 pmulhrsw m1, m7
4936 packuswb m6, m1
4937
4938 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4939 pmulhrsw m1, m7
4940
4941 pslldq m2, 1
4942 palignr m3, m2, 14
4943
4944 pmaddubsw m3, [r4] ; [16]
4945 pmulhrsw m3, m7
4946 packuswb m1, m3
4947
4948 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4949
4950 RET
4951
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 (assembled for SSE4 through INIT_XMM sse4)
; 16x16 angular intra prediction for the mode named in the symbol (16).
; cglobal 4,7,8: 4 arguments (r0-r3), 7 general registers, 8 XMM registers.
;   r0 - output pointer; only TRANSPOSE_STORE_8x8 (defined outside this view)
;        consumes it. NOTE(review): presumably the destination block - confirm.
;   r1 - stride (r5 = 3 * stride, r6 = r0 + 4 * stride)
;   r2 - main reference samples for this mode
;   r3 - side reference; the bytes selected by c_mode16_16 extend the main
;        reference below sample 0
; Each output line blends neighbouring reference byte pairs with one 16-byte
; ang_table entry (pmaddubsw), is scaled by m7 with pmulhrsw (pw_1024 is
; presumably words of 1024, i.e. (x + 16) >> 5 - confirm the constant) and is
; packed to bytes. The bracketed number after each pmaddubsw is the ang_table
; entry used; in this function it steps by -21 modulo 32, and every
; wrap-around is preceded by a palignr that slides the sample window down by
; one sample.
; Work is split into two halves of 16 lines x 8 pixels; each half issues
; TRANSPOSE_STORE_8x8 twice (first argument 0, then 1; second argument is 1
; throughout this function). Before the second half m2 is rotated with
; palignr m2, m2, 6 so that the remaining side bytes sit directly below the
; high qword that movlhps then fills from the main reference.
;-----------------------------------------------------------------------------
4952 INIT_XMM sse4
4953 cglobal intra_pred_ang16_16, 4,7,8
4954
4955 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16, so [r4 + k * 16] is entry 16 + k
4956 lea r5, [r1 * 3] ; r5 -> 3 * stride
4957 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride
4958 mova m7, [pw_1024] ; multiplier for the pmulhrsw scaling step
4959
4960 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4961 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4962 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4963 movu m2, [r3] ; side reference
4964 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
4965 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4966
; ---- first half, lines 0-7: sample pairs (0,1) .. (7,8) ----
4967 pmaddubsw m4, [r4 - 5 * 16] ; [11]
4968 pmulhrsw m4, m7
4969
4970 palignr m3, m2, 15 ; prepend top byte of m2: pairs become (m2[15],0) (0,1) .. (6,7)
4971
4972 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4973 pmulhrsw m5, m7
4974 packuswb m4, m5
4975
4976 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
4977 pmulhrsw m5, m7
4978
4979 palignr m3, m2, 14 ; prepend top two bytes of m2: window moves down one more sample
4980
4981 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4982 pmulhrsw m6, m7
4983 packuswb m5, m6
4984
4985 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
4986 palignr m3, m2, 14 ; slide the window down by one sample
4987
4988 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4989 pmulhrsw m6, m7
4990 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
4991 pmulhrsw m0, m7
4992 packuswb m6, m0
4993
4994 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
4995 palignr m3, m2, 14
4996
4997 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
4998 pmulhrsw m1, m7
4999
5000 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
5001 palignr m3, m2, 14
5002
5003 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5004 pmulhrsw m0, m7
5005 packuswb m1, m0
5006
; m4, m5, m6, m1 each hold two packed 8-byte lines (8 lines in total)
5007 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5008
; ---- first half, lines 8-15 ----
5009 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5010 pmulhrsw m4, m7
5011
5012 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
5013 palignr m3, m2, 14
5014
5015 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5016 pmulhrsw m5, m7
5017 packuswb m4, m5
5018
5019 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
5020 palignr m3, m2, 14
5021
5022 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5023 pmulhrsw m5, m7
5024 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5025 pmulhrsw m6, m7
5026 packuswb m5, m6
5027
5028 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
5029 palignr m3, m2, 14
5030
5031 pmaddubsw m6, m3, [r4 - 16] ; [15]
5032 pmulhrsw m6, m7
5033
5034 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
5035 palignr m3, m2, 14
5036
5037 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5038 pmulhrsw m1, m7
5039 packuswb m6, m1
5040
5041 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5042 pmulhrsw m1, m7
5043
5044 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
5045 palignr m3, m2, 14
5046
5047 pmaddubsw m3, [r4] ; [16]
5048 pmulhrsw m3, m7
5049 packuswb m1, m3
5050
5051 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5052
; ---- second half: r0/r6 move on by stride multiples, window starts at pair (8,9) ----
; NOTE(review): r0 = initial r0 + 8 * stride only if the macro leaves r6 untouched - confirm.
5053 lea r0, [r6 + r1 * 4]
5054 lea r6, [r6 + r1 * 8]
5055
5056 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5057 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5058 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5059 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5060 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
5061
5062 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
5063 pmulhrsw m4, m7
5064
5065 palignr m3, m2, 14 ; prepend samples (7,8): window moves down one sample
5066
5067 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5068 pmulhrsw m5, m7
5069 packuswb m4, m5
5070
5071 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5072 pmulhrsw m5, m7
5073
5074 pslldq m2, 1
5075 palignr m3, m2, 14
5076
5077 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5078 pmulhrsw m6, m7
5079 packuswb m5, m6
5080
5081 pslldq m2, 1
5082 palignr m3, m2, 14
5083
5084 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5085 pmulhrsw m6, m7
5086
5087 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5088 pmulhrsw m0, m7
5089 packuswb m6, m0
5090
5091 pslldq m2, 1
5092 palignr m3, m2, 14
5093
5094 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5095 pmulhrsw m1, m7
5096
5097 pslldq m2, 1
5098 palignr m3, m2, 14
5099
5100 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5101 pmulhrsw m0, m7
5102 packuswb m1, m0
5103
5104 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5105
5106 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5107 pmulhrsw m4, m7
5108
5109 pslldq m2, 1
5110 palignr m3, m2, 14
5111
5112 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5113 pmulhrsw m5, m7
5114 packuswb m4, m5
5115
5116 pslldq m2, 1
5117 palignr m3, m2, 14
5118
5119 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5120 pmulhrsw m5, m7
5121 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5122 pmulhrsw m6, m7
5123 packuswb m5, m6
5124
5125 pslldq m2, 1
5126 palignr m3, m2, 14
5127
5128 pmaddubsw m6, m3, [r4 - 16] ; [15]
5129 pmulhrsw m6, m7
5130
5131 pslldq m2, 1
5132 palignr m3, m2, 14
5133
5134 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5135 pmulhrsw m1, m7
5136 packuswb m6, m1
5137
5138 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5139 pmulhrsw m1, m7
5140
5141 pslldq m2, 1
5142 palignr m3, m2, 14
5143
5144 pmaddubsw m3, [r4] ; [16]
5145 pmulhrsw m3, m7
5146 packuswb m1, m3
5147
5148 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5149
5150 RET
5151
;-----------------------------------------------------------------------------
; intra_pred_ang16_20: 16x16 angular intra prediction kernel, SSE4 build.
; cglobal declares 4 arguments, 7 general-purpose and 8 XMM registers.
; Register roles as used in this body:
;   r0 = output pointer, r1 = output stride, r5 = 3 * stride
;   r3 = main reference (read directly), r2 = side reference (read once and
;        reordered through the c_mode16_16 shuffle table to extend the main row)
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] is coefficient row 16 + k; the
;        bracketed number after each pmaddubsw is that row index
;   r6 = copy of the initial output pointer, used to start the second pass
;   m7 = pw_1024; with words of 1024, pmulhrsw acts as a rounded shift right by 5
; The work is done in two passes of 16 filtered vectors each; every pass hands
; two groups of four packed registers to TRANSPOSE_STORE_8x8.
; NOTE(review): argument meaning (dst, stride, refLeft, refAbove) is assumed
; from the intraPredAng prototype comments in this file -- confirm.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument (0 here) presumably selects the non-transposed store -- confirm.
;-----------------------------------------------------------------------------
5152 INIT_XMM sse4
5153 cglobal intra_pred_ang16_20, 4,7,8
5154
5155 lea r4, [ang_table + 16 * 16] ; r4 -> coefficient row 16
5156 lea r5, [r1 * 3] ; r5 -> 3 * stride
5157 mov r6, r0 ; r6 -> start of the output block, reused for the second pass
5158 mova m7, [pw_1024] ; rounding multiplier for every pmulhrsw below
5159
5160 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5161 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5162 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5163 movu m2, [r2] ; side reference, reordered by the shuffle below
5164 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
5165 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5166
5167 pmaddubsw m4, [r4 - 5 * 16] ; [11]
5168 pmulhrsw m4, m7
5169
5170 palignr m3, m2, 15
5171
5172 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5173 pmulhrsw m5, m7
5174 packuswb m4, m5
5175
5176 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5177 pmulhrsw m5, m7
5178
5179 palignr m3, m2, 14
5180
5181 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5182 pmulhrsw m6, m7
5183 packuswb m5, m6
5184
5185 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
5186 palignr m3, m2, 14
5187
5188 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5189 pmulhrsw m6, m7
5190 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5191 pmulhrsw m0, m7
5192 packuswb m6, m0
5193
5194 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5195 palignr m3, m2, 14
5196
5197 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5198 pmulhrsw m1, m7
5199
5200 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
5201 palignr m3, m2, 14
5202
5203 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5204 pmulhrsw m0, m7
5205 packuswb m1, m0
5206
5207 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5208
5209 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5210 pmulhrsw m4, m7
5211
5212 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
5213 palignr m3, m2, 14
5214
5215 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5216 pmulhrsw m5, m7
5217 packuswb m4, m5
5218
5219 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
5220 palignr m3, m2, 14
5221
5222 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5223 pmulhrsw m5, m7
5224 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5225 pmulhrsw m6, m7
5226 packuswb m5, m6
5227
5228 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
5229 palignr m3, m2, 14
5230
5231 pmaddubsw m6, m3, [r4 - 16] ; [15]
5232 pmulhrsw m6, m7
5233
5234 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
5235 palignr m3, m2, 14
5236
5237 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5238 pmulhrsw m1, m7
5239 packuswb m6, m1
5240
5241 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5242 pmulhrsw m1, m7
5243
5244 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
5245 palignr m3, m2, 14
5246
5247 pmaddubsw m3, [r4] ; [16]
5248 pmulhrsw m3, m7
5249 packuswb m1, m3
5250
5251 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5252
; Second pass: output pointer moves 8 bytes right of the block start and the
; main reference is re-read one byte further on.
5253 lea r0, [r6 + 8]
5254
5255 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5256 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5257 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5258 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5259 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
5260
5261 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
5262 pmulhrsw m4, m7
5263
5264 palignr m3, m2, 14
5265
5266 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5267 pmulhrsw m5, m7
5268 packuswb m4, m5
5269
5270 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5271 pmulhrsw m5, m7
5272
5273 pslldq m2, 1
5274 palignr m3, m2, 14
5275
5276 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5277 pmulhrsw m6, m7
5278 packuswb m5, m6
5279
5280 pslldq m2, 1
5281 palignr m3, m2, 14
5282
5283 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5284 pmulhrsw m6, m7
5285
5286 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5287 pmulhrsw m0, m7
5288 packuswb m6, m0
5289
5290 pslldq m2, 1
5291 palignr m3, m2, 14
5292
5293 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5294 pmulhrsw m1, m7
5295
5296 pslldq m2, 1
5297 palignr m3, m2, 14
5298
5299 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5300 pmulhrsw m0, m7
5301 packuswb m1, m0
5302
5303 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5304
5305 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5306 pmulhrsw m4, m7
5307
5308 pslldq m2, 1
5309 palignr m3, m2, 14
5310
5311 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5312 pmulhrsw m5, m7
5313 packuswb m4, m5
5314
5315 pslldq m2, 1
5316 palignr m3, m2, 14
5317
5318 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5319 pmulhrsw m5, m7
5320 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5321 pmulhrsw m6, m7
5322 packuswb m5, m6
5323
5324 pslldq m2, 1
5325 palignr m3, m2, 14
5326
5327 pmaddubsw m6, m3, [r4 - 16] ; [15]
5328 pmulhrsw m6, m7
5329
5330 pslldq m2, 1
5331 palignr m3, m2, 14
5332
5333 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5334 pmulhrsw m1, m7
5335 packuswb m6, m1
5336
5337 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5338 pmulhrsw m1, m7
5339
5340 pslldq m2, 1
5341 palignr m3, m2, 14
5342
5343 pmaddubsw m3, [r4] ; [16]
5344 pmulhrsw m3, m7
5345 packuswb m1, m3
5346
5347 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5348
5349 RET
5350
;-----------------------------------------------------------------------------
; intra_pred_ang16_17: 16x16 angular intra prediction kernel, SSE4 build.
; cglobal declares 4 arguments, 7 general-purpose and 8 XMM registers.
; Register roles as used in this body:
;   r0 = output pointer, r1 = output stride, r5 = 3 * stride
;   r6 = r0 + 4 * stride (second row group of the current 8-row band)
;   r2 = main reference (read directly), r3 = side reference (reordered through
;        the c_mode16_17 shuffle table to extend the main row)
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] is coefficient row 16 + k; the
;        bracketed number after each pmaddubsw is that row index
;   m7 = pw_1024; with words of 1024, pmulhrsw acts as a rounded shift right by 5
; The work is done in two passes of 16 filtered vectors each; every pass hands
; two groups of four packed registers to TRANSPOSE_STORE_8x8.
; NOTE(review): argument meaning (dst, stride, refLeft, refAbove) is assumed
; from the intraPredAng prototype comments in this file -- confirm.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument (1 here) presumably selects the transposed store -- confirm.
;-----------------------------------------------------------------------------
5351 INIT_XMM sse4
5352 cglobal intra_pred_ang16_17, 4,7,8
5353
5354 lea r4, [ang_table + 16 * 16] ; r4 -> coefficient row 16
5355 lea r5, [r1 * 3] ; r5 -> 3 * stride
5356 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5357 mova m7, [pw_1024] ; rounding multiplier for every pmulhrsw below
5358
5359 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5360 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5361 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5362 movu m2, [r3] ; side reference, reordered by the shuffle below
5363 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
5364 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5365
5366 pmaddubsw m4, [r4 - 10 * 16] ; [06]
5367 pmulhrsw m4, m7
5368
5369 palignr m3, m2, 15
5370
5371 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5372 pmulhrsw m5, m7
5373 packuswb m4, m5
5374
5375 palignr m3, m2, 14
5376
5377 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5378 pmulhrsw m5, m7
5379
5380 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
5381 pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
5382 palignr m3, m2, 14
5383
5384 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5385 pmulhrsw m6, m7
5386 packuswb m5, m6
5387
5388 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
5389 palignr m3, m2, 14
5390
5391 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5392 pmulhrsw m6, m7
5393 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5394 pmulhrsw m0, m7
5395 packuswb m6, m0
5396
5397 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
5398 palignr m3, m2, 14
5399
5400 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5401 pmulhrsw m1, m7
5402
5403 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5404 palignr m3, m2, 14
5405
5406 pmaddubsw m0, m3, [r4] ; [16]
5407 pmulhrsw m0, m7
5408 packuswb m1, m0
5409
5410 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5411
5412 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
5413 palignr m3, m2, 14
5414
5415 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5416 pmulhrsw m4, m7
5417
5418 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
5419 palignr m3, m2, 14
5420
5421 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5422 pmulhrsw m5, m7
5423 packuswb m4, m5
5424
5425 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5426 pmulhrsw m5, m7
5427
5428 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
5429 palignr m3, m2, 14
5430
5431 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5432 pmulhrsw m6, m7
5433 packuswb m5, m6
5434
5435 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
5436 palignr m3, m2, 14
5437
5438 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5439 pmulhrsw m6, m7
5440
5441 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
5442 palignr m3, m2, 14
5443
5444 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5445 pmulhrsw m1, m7
5446 packuswb m6, m1
5447
5448 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
5449 palignr m3, m2, 14
5450
5451 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5452 pmulhrsw m1, m7
5453 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5454 pmulhrsw m3, m7
5455 packuswb m1, m3
5456
5457 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5458
; Second pass: both row pointers move 8 rows down (r6 was r0 + 4 * stride) and
; the main reference is re-read one byte further on.
5459 lea r0, [r6 + r1 * 4]
5460 lea r6, [r6 + r1 * 8]
5461
5462 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5463 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5464 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5465 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5466 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
5467
5468 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
5469 pmulhrsw m4, m7
5470
5471 palignr m3, m2, 14
5472
5473 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5474 pmulhrsw m5, m7
5475 packuswb m4, m5
5476
5477 pslldq m2, 1
5478 palignr m3, m2, 14
5479
5480 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5481 pmulhrsw m5, m7
5482
5483 pslldq m2, 1
5484 palignr m3, m2, 14
5485
5486 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5487 pmulhrsw m6, m7
5488 packuswb m5, m6
5489
5490 pslldq m2, 1
5491 palignr m3, m2, 14
5492
5493 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5494 pmulhrsw m6, m7
5495 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5496 pmulhrsw m0, m7
5497 packuswb m6, m0
5498
5499 pslldq m2, 1
5500 palignr m3, m2, 14
5501
5502 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5503 pmulhrsw m1, m7
5504
5505 pslldq m2, 1
5506 palignr m3, m2, 14
5507
5508 pmaddubsw m0, m3, [r4] ; [16]
5509 pmulhrsw m0, m7
5510 packuswb m1, m0
5511
5512 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5513
5514 pslldq m2, 1
5515 palignr m3, m2, 14
5516
5517 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5518 pmulhrsw m4, m7
5519
5520 pslldq m2, 1
5521 palignr m3, m2, 14
5522
5523 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5524 pmulhrsw m5, m7
5525 packuswb m4, m5
5526
5527 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5528 pmulhrsw m5, m7
5529
5530 pslldq m2, 1
5531 palignr m3, m2, 14
5532
5533 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5534 pmulhrsw m6, m7
5535 packuswb m5, m6
5536
5537 pslldq m2, 1
5538 palignr m3, m2, 14
5539
5540 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5541 pmulhrsw m6, m7
5542
5543 pslldq m2, 1
5544 palignr m3, m2, 14
5545
5546 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5547 pmulhrsw m1, m7
5548 packuswb m6, m1
5549
5550 pslldq m2, 1
5551 palignr m3, m2, 14
5552
5553 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5554 pmulhrsw m1, m7
5555 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5556 pmulhrsw m3, m7
5557 packuswb m1, m3
5558
5559 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5560
5561 RET
5562
;-----------------------------------------------------------------------------
; intra_pred_ang16_19: 16x16 angular intra prediction kernel, SSE4 build.
; Uses the same coefficient rows and the same c_mode16_17 shuffle table as
; intra_pred_ang16_17, but with the roles of r2 and r3 exchanged and with
; TRANSPOSE_STORE_8x8 called with 0 as its second argument.
; cglobal declares 4 arguments, 7 general-purpose and 8 XMM registers.
; Register roles as used in this body:
;   r0 = output pointer, r1 = output stride, r5 = 3 * stride
;   r3 = main reference (read directly), r2 = side reference (reordered through
;        the c_mode16_17 shuffle table to extend the main row)
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] is coefficient row 16 + k; the
;        bracketed number after each pmaddubsw is that row index
;   r6 = copy of the initial output pointer, used to start the second pass
;   m7 = pw_1024; with words of 1024, pmulhrsw acts as a rounded shift right by 5
; NOTE(review): argument meaning (dst, stride, refLeft, refAbove) is assumed
; from the intraPredAng prototype comments in this file -- confirm.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument (0 here) presumably selects the non-transposed store -- confirm.
;-----------------------------------------------------------------------------
5563 INIT_XMM sse4
5564 cglobal intra_pred_ang16_19, 4,7,8
5565
5566 lea r4, [ang_table + 16 * 16] ; r4 -> coefficient row 16
5567 lea r5, [r1 * 3] ; r5 -> 3 * stride
5568 mov r6, r0 ; r6 -> start of the output block, reused for the second pass
5569 mova m7, [pw_1024] ; rounding multiplier for every pmulhrsw below
5570
5571 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5572 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5573 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5574 movu m2, [r2] ; side reference, reordered by the shuffle below
5575 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
5576 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5577
5578 pmaddubsw m4, [r4 - 10 * 16] ; [06]
5579 pmulhrsw m4, m7
5580
5581 palignr m3, m2, 15
5582
5583 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5584 pmulhrsw m5, m7
5585 packuswb m4, m5
5586
5587 palignr m3, m2, 14
5588
5589 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5590 pmulhrsw m5, m7
5591
5592 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
5593 pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
5594 palignr m3, m2, 14
5595
5596 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5597 pmulhrsw m6, m7
5598 packuswb m5, m6
5599
5600 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
5601 palignr m3, m2, 14
5602
5603 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5604 pmulhrsw m6, m7
5605 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5606 pmulhrsw m0, m7
5607 packuswb m6, m0
5608
5609 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
5610 palignr m3, m2, 14
5611
5612 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5613 pmulhrsw m1, m7
5614
5615 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5616 palignr m3, m2, 14
5617
5618 pmaddubsw m0, m3, [r4] ; [16]
5619 pmulhrsw m0, m7
5620 packuswb m1, m0
5621
5622 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5623
5624 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
5625 palignr m3, m2, 14
5626
5627 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5628 pmulhrsw m4, m7
5629
5630 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
5631 palignr m3, m2, 14
5632
5633 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5634 pmulhrsw m5, m7
5635 packuswb m4, m5
5636
5637 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5638 pmulhrsw m5, m7
5639
5640 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
5641 palignr m3, m2, 14
5642
5643 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5644 pmulhrsw m6, m7
5645 packuswb m5, m6
5646
5647 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
5648 palignr m3, m2, 14
5649
5650 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5651 pmulhrsw m6, m7
5652
5653 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
5654 palignr m3, m2, 14
5655
5656 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5657 pmulhrsw m1, m7
5658 packuswb m6, m1
5659
5660 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
5661 palignr m3, m2, 14
5662
5663 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5664 pmulhrsw m1, m7
5665 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5666 pmulhrsw m3, m7
5667 packuswb m1, m3
5668
5669 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5670
; Second pass: output pointer moves 8 bytes right of the block start and the
; main reference is re-read one byte further on.
5671 lea r0, [r6 + 8]
5672
5673 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5674 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5675 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5676 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5677 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
5678
5679 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
5680 pmulhrsw m4, m7
5681
5682 palignr m3, m2, 14
5683
5684 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5685 pmulhrsw m5, m7
5686 packuswb m4, m5
5687
5688 pslldq m2, 1
5689 palignr m3, m2, 14
5690
5691 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5692 pmulhrsw m5, m7
5693
5694 pslldq m2, 1
5695 palignr m3, m2, 14
5696
5697 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5698 pmulhrsw m6, m7
5699 packuswb m5, m6
5700
5701 pslldq m2, 1
5702 palignr m3, m2, 14
5703
5704 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5705 pmulhrsw m6, m7
5706 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5707 pmulhrsw m0, m7
5708 packuswb m6, m0
5709
5710 pslldq m2, 1
5711 palignr m3, m2, 14
5712
5713 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5714 pmulhrsw m1, m7
5715
5716 pslldq m2, 1
5717 palignr m3, m2, 14
5718
5719 pmaddubsw m0, m3, [r4] ; [16]
5720 pmulhrsw m0, m7
5721 packuswb m1, m0
5722
5723 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5724
5725 pslldq m2, 1
5726 palignr m3, m2, 14
5727
5728 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5729 pmulhrsw m4, m7
5730
5731 pslldq m2, 1
5732 palignr m3, m2, 14
5733
5734 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5735 pmulhrsw m5, m7
5736 packuswb m4, m5
5737
5738 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5739 pmulhrsw m5, m7
5740
5741 pslldq m2, 1
5742 palignr m3, m2, 14
5743
5744 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5745 pmulhrsw m6, m7
5746 packuswb m5, m6
5747
5748 pslldq m2, 1
5749 palignr m3, m2, 14
5750
5751 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5752 pmulhrsw m6, m7
5753
5754 pslldq m2, 1
5755 palignr m3, m2, 14
5756
5757 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5758 pmulhrsw m1, m7
5759 packuswb m6, m1
5760
5761 pslldq m2, 1
5762 palignr m3, m2, 14
5763
5764 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5765 pmulhrsw m1, m7
5766 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5767 pmulhrsw m3, m7
5768 packuswb m1, m3
5769
5770 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5771
5772 RET
5773
; Store one 16-byte output row of intra_pred_ang16_18: the 32-byte window
; {m0:m1} moved down by %2 bytes.  Clobbers m2.
;   %1 = row address expression, %2 = byte shift
%macro ANG16_MODE18_ROW 2
    palignr     m2, m0, m1, %2
    movu        [%1], m2
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang16_18: 16x16 prediction built purely from byte shifts, with no
; filtering.  Row 0 is the 16 bytes at [r3]; every following row is the
; previous one moved up by one byte, with the vacated low byte filled from the
; c_mode16_18-shuffled bytes of [r2].
;   r0 = output pointer, r1 = output stride
;   r2 / r3 = reference pointers on entry, reused as 2 * / 3 * stride after the loads
;   r4 = 4 * stride
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3

    ; Both references must be loaded before r2 / r3 are recycled as strides.
    movu        m0, [r3]
    movu        m1, [r2]
    mova        m2, [c_mode16_18]
    pshufb      m1, m2

    lea         r2, [r1 * 2]                ; r2 -> 2 * stride
    lea         r3, [r1 * 3]                ; r3 -> 3 * stride
    lea         r4, [r1 * 4]                ; r4 -> 4 * stride

    ; rows 0..3
    movu        [r0], m0
    ANG16_MODE18_ROW r0 + r1, 15
    ANG16_MODE18_ROW r0 + r2, 14
    ANG16_MODE18_ROW r0 + r3, 13
    lea         r0, [r0 + r4]

    ; rows 4..7
    ANG16_MODE18_ROW r0,      12
    ANG16_MODE18_ROW r0 + r1, 11
    ANG16_MODE18_ROW r0 + r2, 10
    ANG16_MODE18_ROW r0 + r3, 9
    lea         r0, [r0 + r4]

    ; rows 8..11
    ANG16_MODE18_ROW r0,      8
    ANG16_MODE18_ROW r0 + r1, 7
    ANG16_MODE18_ROW r0 + r2, 6
    ANG16_MODE18_ROW r0 + r3, 5
    lea         r0, [r0 + r4]

    ; rows 12..15
    ANG16_MODE18_ROW r0,      4
    ANG16_MODE18_ROW r0 + r1, 3
    ANG16_MODE18_ROW r0 + r2, 2
    ANG16_MODE18_ROW r0 + r3, 1
    RET
5820
5821 ;---------------------------------------------------------------------------------------------------------------
5822 ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
5823 ;---------------------------------------------------------------------------------------------------------------
; Store one 32-byte output row of intra_pred_ang32_2, taken from the 48-byte
; window {%4:%3:%2} moved down by %5 bytes.  A shift of 0 stores %2 and %3
; unchanged.  Clobbers m2 for non-zero shifts.
;   %1 = row address expression
;   %2 / %3 / %4 = low / middle / high xmm register of the window
;   %5 = byte shift (0..15)
%macro ANG32_MODE2_ROW 5
%if %5 == 0
    movu        [%1], %2
    movu        [%1 + 16], %3
%else
    palignr     m2, %3, %2, %5
    movu        [%1], m2
    palignr     m2, %4, %3, %5
    movu        [%1 + 16], m2
%endif
%endmacro

; 32x32 prediction built purely from byte shifts: output row k is the 32
; reference bytes starting at [ref + 2 + k].  The reference is r2, replaced by
; the fourth argument when the mode argument (r4m) equals 34.
;   r0 = output pointer, r1 = output stride, r3 = 3 * stride (after the cmove)
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,4,4
    cmp         r4m, byte 34
    cmove       r2, r3mp                    ; mode 34 reads the other reference
    lea         r3, [r1 * 3]                ; r3 -> 3 * stride

    movu        m0, [r2 + 2]
    movu        m1, [r2 + 18]
    movu        m3, [r2 + 34]

    ; rows 0..15: window {m3:m1:m0}
    ANG32_MODE2_ROW r0,          m0, m1, m3, 0
    ANG32_MODE2_ROW r0 + r1,     m0, m1, m3, 1
    ANG32_MODE2_ROW r0 + r1 * 2, m0, m1, m3, 2
    ANG32_MODE2_ROW r0 + r3,     m0, m1, m3, 3

    lea         r0, [r0 + r1 * 4]

    ANG32_MODE2_ROW r0,          m0, m1, m3, 4
    ANG32_MODE2_ROW r0 + r1,     m0, m1, m3, 5
    ANG32_MODE2_ROW r0 + r1 * 2, m0, m1, m3, 6
    ANG32_MODE2_ROW r0 + r3,     m0, m1, m3, 7

    lea         r0, [r0 + r1 * 4]

    ANG32_MODE2_ROW r0,          m0, m1, m3, 8
    ANG32_MODE2_ROW r0 + r1,     m0, m1, m3, 9
    ANG32_MODE2_ROW r0 + r1 * 2, m0, m1, m3, 10
    ANG32_MODE2_ROW r0 + r3,     m0, m1, m3, 11

    lea         r0, [r0 + r1 * 4]

    ANG32_MODE2_ROW r0,          m0, m1, m3, 12
    ANG32_MODE2_ROW r0 + r1,     m0, m1, m3, 13
    ANG32_MODE2_ROW r0 + r1 * 2, m0, m1, m3, 14
    ANG32_MODE2_ROW r0 + r3,     m0, m1, m3, 15

    lea         r0, [r0 + r1 * 4]

    ; rows 16..31: window {m0:m3:m1}; m0 is refilled with the next 16
    ; reference bytes once its old contents are no longer needed
    movu        [r0], m1
    movu        m0, [r2 + 50]
    movu        [r0 + 16], m3
    ANG32_MODE2_ROW r0 + r1,     m1, m3, m0, 1
    ANG32_MODE2_ROW r0 + r1 * 2, m1, m3, m0, 2
    ANG32_MODE2_ROW r0 + r3,     m1, m3, m0, 3

    lea         r0, [r0 + r1 * 4]

    ANG32_MODE2_ROW r0,          m1, m3, m0, 4
    ANG32_MODE2_ROW r0 + r1,     m1, m3, m0, 5
    ANG32_MODE2_ROW r0 + r1 * 2, m1, m3, m0, 6
    ANG32_MODE2_ROW r0 + r3,     m1, m3, m0, 7

    lea         r0, [r0 + r1 * 4]

    ANG32_MODE2_ROW r0,          m1, m3, m0, 8
    ANG32_MODE2_ROW r0 + r1,     m1, m3, m0, 9
    ANG32_MODE2_ROW r0 + r1 * 2, m1, m3, m0, 10
    ANG32_MODE2_ROW r0 + r3,     m1, m3, m0, 11

    lea         r0, [r0 + r1 * 4]

    ANG32_MODE2_ROW r0,          m1, m3, m0, 12
    ANG32_MODE2_ROW r0 + r1,     m1, m3, m0, 13
    ANG32_MODE2_ROW r0 + r1 * 2, m1, m3, m0, 14
    ANG32_MODE2_ROW r0 + r3,     m1, m3, m0, 15
    RET
5981
; Helper for PROC32_8x8: filter one pair of input registers and pack the two
; results into %1 (first result in the low 8 bytes, second in the high 8 bytes).
;   %1, %2 = xmm registers holding the first / second input of the pair
;   %3, %4 = coefficient selector for each input; the filter reads
;            [r4 + selector * 16].  A selector of 0 means "no filtering":
;            %1 is then left untouched and %2 is only widened and re-packed.
;   %5     = operand supplying the pmulhrsw rounding multiplier (pw_1024)
; Uses [r3] as the pshufb mask that prepares a register for pmaddubsw.
%macro PROC32_8x8_PAIR 5
%if %3 != 0
    pshufb      %1, [r3]
    pmaddubsw   %1, [r4 + %3 * 16]
    pmulhrsw    %1, %5
%endif
%if %4 == 0
    pmovzxbw    %2, %2
%else
    pshufb      %2, [r3]
    pmaddubsw   %2, [r4 + %4 * 16]
    pmulhrsw    %2, %5
%endif
%if %3 == 0
    ; %1 still holds raw bytes: pack %2 on its own and append it to %1.
    packuswb    %2, %2
    movlhps     %1, %2
%else
    packuswb    %1, %2
%endif
%endmacro

; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
;   %1      = 8-byte column group of the output (used by the transposed store only)
;   %2      = 1 to transpose before storing, anything else stores as-is
;   %3..%10 = coefficient selectors c0..c7 for m0..m7 (0 = no filtering)
; Transposed store writes rows through r0 and r6 (with r1 = stride and
; r5 = 3 * stride); the plain store writes 8 rows through r0 and leaves r0
; advanced by 4 * r1.  Clobbers m0-m7.
%macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
    ; m1 is still input data for the first pair, so the rounding constant is
    ; read from memory there; afterwards m1 is free to hold it.
    PROC32_8x8_PAIR m0, m1, %3, %4, [pw_1024]
    mova        m1, [pw_1024]
    PROC32_8x8_PAIR m2, m3, %5, %6, m1
    PROC32_8x8_PAIR m4, m5, %7, %8, m1
    PROC32_8x8_PAIR m6, m7, %9, %10, m1

%if %2 == 1
    ; transpose
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2
    punpckhbw   m3, m0, m1
    punpcklbw   m0, m1

    punpckhbw   m1, m4, m6
    punpcklbw   m4, m6
    punpckhbw   m6, m4, m1
    punpcklbw   m4, m1

    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckldq   m4, m3, m6
    punpckhdq   m3, m6

    movh        [r0 + %1 * 8], m0
    movhps      [r0 + r1 + %1 * 8], m0
    movh        [r0 + r1*2 + %1 * 8], m2
    movhps      [r0 + r5 + %1 * 8], m2
    movh        [r6 + %1 * 8], m4
    movhps      [r6 + r1 + %1 * 8], m4
    movh        [r6 + r1*2 + %1 * 8], m3
    movhps      [r6 + r5 + %1 * 8], m3
%else
    movh        [r0         ], m0
    movhps      [r0 + r1    ], m0
    movh        [r0 + r1 * 2], m2
    movhps      [r0 + r5    ], m2
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m6
    movhps      [r0 + r5    ], m6
%endif
%endmacro
6099
; MODE_3_33: filter 32 reference-derived vectors in four groups of eight and
; hand each group to TRANSPOSE_STORE_8x8 (column groups 0..3).
;   %1 = passed through unchanged as the second argument of TRANSPOSE_STORE_8x8
; Expects, as set up by intra_pred_ang32_3 in this file:
;   r2 = reference pointer, r3 = ang_table + 16 * 16 (so [r3 + k * 16] is
;   coefficient row 16 + k, the bracketed number after each pmaddubsw),
;   m7 = pw_1024 rounding multiplier for pmulhrsw.
; Entries marked [00] need no filtering and are loaded straight from the
; reference with movhps.  Clobbers m0-m6.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; the output
; registers it uses are not visible here.
6100 %macro MODE_3_33 1
6101 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6102 palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
6103 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
6104 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
6105 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
6106 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
6107 pmulhrsw m4, m7
6108 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6109 pmulhrsw m1, m7
6110 packuswb m4, m1
6111 palignr m5, m2, m0, 4
6112 pmaddubsw m5, [r3 - 2 * 16] ; [14]
6113 pmulhrsw m5, m7
6114 palignr m6, m2, m0, 6
6115 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
6116 pmulhrsw m6, m7
6117 packuswb m5, m6
6118 palignr m1, m2, m0, 8
6119 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
6120 pmulhrsw m6, m7
6121 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6122 pmulhrsw m1, m7
6123 packuswb m6, m1
6124 palignr m1, m2, m0, 10
6125 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6126 pmulhrsw m1, m7
6127 palignr m2, m0, 12
6128 pmaddubsw m2, [r3] ; [16]
6129 pmulhrsw m2, m7
6130 packuswb m1, m2
6131
6132 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6133
6134 movu m0, [r2 + 8]
6135 palignr m1, m0, 1
6136 punpckhbw m2, m0, m1
6137 punpcklbw m0, m1
6138 palignr m5, m2, m0, 2
6139 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
6140 pmulhrsw m4, m7
6141 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
6142 pmulhrsw m1, m7
6143 packuswb m4, m1
6144 pmaddubsw m5, [r3 + 14 * 16] ; [30]
6145 pmulhrsw m5, m7
6146 palignr m6, m2, m0, 4
6147 pmaddubsw m6, [r3 + 8 * 16] ; [24]
6148 pmulhrsw m6, m7
6149 packuswb m5, m6
6150 palignr m1, m2, m0, 6
6151 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
6152 pmulhrsw m6, m7
6153 palignr m1, m2, m0, 8
6154 pmaddubsw m1, [r3 - 4 * 16] ; [12]
6155 pmulhrsw m1, m7
6156 packuswb m6, m1
6157 palignr m1, m2, m0, 10
6158 pmaddubsw m1, [r3 - 10 * 16] ; [06]
6159 pmulhrsw m1, m7
6160 packuswb m1, m1
6161 movhps m1, [r2 + 14] ; [00]
6162
6163 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6164
6165 movu m0, [r2 + 14]
6166 palignr m1, m0, 1
6167 punpckhbw m2, m0, m1
6168 punpcklbw m0, m1
6169 palignr m1, m2, m0, 2
6170 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
6171 pmulhrsw m4, m7
6172 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6173 pmulhrsw m1, m7
6174 packuswb m4, m1
6175 palignr m5, m2, m0, 4
6176 pmaddubsw m5, [r3 - 2 * 16] ; [14]
6177 pmulhrsw m5, m7
6178 palignr m6, m2, m0, 6
6179 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
6180 pmulhrsw m6, m7
6181 packuswb m5, m6
6182 palignr m1, m2, m0, 8
6183 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
6184 pmulhrsw m6, m7
6185 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6186 pmulhrsw m1, m7
6187 packuswb m6, m1
6188 palignr m1, m2, m0, 10
6189 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6190 pmulhrsw m1, m7
6191 palignr m2, m0, 12
6192 pmaddubsw m2, [r3] ; [16]
6193 pmulhrsw m2, m7
6194 packuswb m1, m2
6195
6196 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6197
6198 movu m0, [r2 + 21]
6199 palignr m1, m0, 1
6200 punpckhbw m2, m0, m1
6201 punpcklbw m0, m1
6202 palignr m5, m2, m0, 2
6203 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
6204 pmulhrsw m4, m7
6205 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
6206 pmulhrsw m1, m7
6207 packuswb m4, m1
6208 pmaddubsw m5, [r3 + 14 * 16] ; [30]
6209 pmulhrsw m5, m7
6210 palignr m6, m2, m0, 4
6211 pmaddubsw m6, [r3 + 8 * 16] ; [24]
6212 pmulhrsw m6, m7
6213 packuswb m5, m6
6214 palignr m1, m2, m0, 6
6215 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
6216 pmulhrsw m6, m7
6217 palignr m1, m2, m0, 8
6218 pmaddubsw m1, [r3 - 4 * 16] ; [12]
6219 pmulhrsw m1, m7
6220 packuswb m6, m1
6221 palignr m1, m2, m0, 10
6222 pmaddubsw m1, [r3 - 10 * 16] ; [06]
6223 pmulhrsw m1, m7
6224 packuswb m1, m1
6225 movhps m1, [r2 + 27] ; [00]
6226
6227 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6228 %endmacro
6229 ;------------------------------------------------------------------------------------------------------------------
6230 ; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6231 ;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    ; Constants expected by MODE_3_33.
    mova        m7, [pw_1024]               ; rounding multiplier for pmulhrsw
    lea         r3, [ang_table + 16 * 16]   ; r3 -> coefficient row 16
    ; Output addressing: r0 and r6 are 4 rows apart, r5 is 3 * stride.
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mov         r4d, 4                      ; four passes of MODE_3_33
.loop:
    MODE_3_33 1
    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 -> 8 rows below its previous position
    lea         r6, [r6 + r1 * 8]           ; r6 -> 8 rows below its previous position
    dec         r4
    jnz         .loop
    RET
6247
; MODE_4_32: filter 32 reference-derived vectors in four groups of eight and
; hand each group to TRANSPOSE_STORE_8x8 (column groups 0..3).
;   %1 = passed through unchanged as the second argument of TRANSPOSE_STORE_8x8
; Expects, as set up by intra_pred_ang32_4 in this file:
;   r2 = reference pointer, r3 = ang_table + 16 * 16 (so [r3 + k * 16] is
;   coefficient row 16 + k, the bracketed number after each pmaddubsw),
;   m7 = pw_1024 rounding multiplier for pmulhrsw.
; The entry marked [00] needs no filtering and is loaded straight from the
; reference with movhps.  Clobbers m0-m6.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; the output
; registers it uses are not visible here.
6248 %macro MODE_4_32 1
6249 movu m0, [r2 + 1]
6250 palignr m1, m0, 1
6251 punpckhbw m2, m0, m1
6252 punpcklbw m0, m1
6253 palignr m1, m2, m0, 2
6254 mova m5, m1
6255 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
6256 pmulhrsw m4, m7
6257 pmaddubsw m1, [r3 - 6 * 16] ; [10]
6258 pmulhrsw m1, m7
6259 packuswb m4, m1
6260 pmaddubsw m5, [r3 + 15 * 16] ; [31]
6261 pmulhrsw m5, m7
6262 palignr m6, m2, m0, 4
6263 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
6264 pmulhrsw m6, m7
6265 packuswb m5, m6
6266 palignr m1, m2, m0, 6
6267 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
6268 pmulhrsw m6, m7
6269 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6270 pmulhrsw m1, m7
6271 packuswb m6, m1
6272 palignr m1, m2, m0, 8
6273 pmaddubsw m1, [r3 + 3 * 16] ; [19]
6274 pmulhrsw m1, m7
6275 palignr m2, m0, 10
6276 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
6277 pmulhrsw m3, m7
6278 packuswb m1, m3
6279
6280 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6281
; m2 still holds the last shifted vector of the previous group and is filtered
; once more before the reference is reloaded.
6282 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6283 pmulhrsw m4, m7
6284 movu m0, [r2 + 6]
6285 palignr m1, m0, 1
6286 punpckhbw m2, m0, m1
6287 punpcklbw m0, m1
6288 palignr m1, m2, m0, 2
6289 pmaddubsw m1, [r3 + 2 * 16] ; [18]
6290 pmulhrsw m1, m7
6291 packuswb m4, m1
6292 palignr m5, m2, m0, 4
6293 mova m6, m5
6294 pmaddubsw m5, [r3 - 9 * 16] ; [07]
6295 pmulhrsw m5, m7
6296 pmaddubsw m6, [r3 + 12 * 16] ; [28]
6297 pmulhrsw m6, m7
6298 packuswb m5, m6
6299 palignr m6, m2, m0, 6
6300 pmaddubsw m6, [r3 + 16] ; [17]
6301 pmulhrsw m6, m7
6302 palignr m1, m2, m0, 8
6303 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
6304 pmulhrsw m3, m7
6305 packuswb m6, m3
6306 pmaddubsw m1, [r3 + 11 * 16] ; [27]
6307 pmulhrsw m1, m7
6308 palignr m2, m0, 10
6309 pmaddubsw m2, [r3] ; [16]
6310 pmulhrsw m2, m7
6311 packuswb m1, m2
6312
6313 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6314
6315 movu m0, [r2 + 12]
6316 palignr m1, m0, 1
6317 punpckhbw m2, m0, m1
6318 punpcklbw m0, m1
6319 mova m1, m0
6320 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6321 pmulhrsw m4, m7
6322 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6323 pmulhrsw m1, m7
6324 packuswb m4, m1
6325 palignr m5, m2, m0, 2
6326 pmaddubsw m5, [r3 - 16] ; [15]
6327 pmulhrsw m5, m7
6328 palignr m6, m2, m0, 4
6329 mova m1, m6
6330 pmaddubsw m1, [r3 - 12 * 16] ; [4]
6331 pmulhrsw m1, m7
6332 packuswb m5, m1
6333 pmaddubsw m6, [r3 + 9 * 16] ; [25]
6334 pmulhrsw m6, m7
6335 palignr m1, m2, m0, 6
6336 pmaddubsw m1, [r3 - 2 * 16] ; [14]
6337 pmulhrsw m1, m7
6338 packuswb m6, m1
6339 palignr m1, m2, m0, 8
6340 mova m2, m1
6341 pmaddubsw m1, [r3 - 13 * 16] ; [3]
6342 pmulhrsw m1, m7
6343 pmaddubsw m2, [r3 + 8 * 16] ; [24]
6344 pmulhrsw m2, m7
6345 packuswb m1, m2
6346
6347 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6348
6349 movu m0, [r2 + 17]
6350 palignr m1, m0, 1
6351 punpckhbw m2, m0, m1
6352 punpcklbw m0, m1
6353 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
6354 pmulhrsw m4, m7
6355 palignr m5, m2, m0, 2
6356 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
6357 pmulhrsw m1, m7
6358 packuswb m4, m1
6359 pmaddubsw m5, [r3 + 7 * 16] ; [23]
6360 pmulhrsw m5, m7
6361 palignr m6, m2, m0, 4
6362 pmaddubsw m6, [r3 - 4 * 16] ; [12]
6363 pmulhrsw m6, m7
6364 packuswb m5, m6
6365 palignr m6, m2, m0, 6
6366 mova m1, m6
6367 pmaddubsw m6, [r3 - 15 * 16] ; [1]
6368 pmulhrsw m6, m7
6369 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6370 pmulhrsw m1, m7
6371 packuswb m6, m1
6372 palignr m1, m2, m0, 8
6373 pmaddubsw m1, [r3 - 5 * 16] ; [11]
6374 pmulhrsw m1, m7
6375 packuswb m1, m1
6376 movhps m1, [r2 + 22] ; [00]
6377
6378 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6379 %endmacro
6380 ;-----------------------------------------------------------------------------------------------------------------
6381 ; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6382 ;-----------------------------------------------------------------------------------------------------------------
; cglobal loads only the first three arguments: r0 = dst, r1 = dstStride, r2 = refLeft.
; MODE_4_32 is expanded once in the loop body and executed four times; between
; passes r0/r6 are moved on by 8 * stride and r2 by 8 reference bytes.
6383 INIT_XMM sse4
6384 cglobal intra_pred_ang32_4, 3,7,8
6385 mova m7, [pw_1024] ; m7 = multiplier for every pmulhrsw in MODE_4_32
6386 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
6387 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride
6388 lea r5, [r1 * 3] ; r5 = 3 * stride
6389 mov r4d, 4 ; r4 = passes left
6390 .loop:
6391 MODE_4_32 1
6392 add r2, 8 ; step the reference pointer (flags are rewritten by dec below)
6393 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6394 lea r6, [r6 + r1 * 8] ; r6 = r6 + 8 * stride
6395 dec r4d
6396 jnz .loop
6397 RET
6398
; MODE_5_31 %1
;   Produces 32 interpolated 8-byte results in four groups of eight and passes
;   each group, packed two results per register in m4, m5, m6, m1, to
;   TRANSPOSE_STORE_8x8 <group 0..3>, %1, m4, m5, m6, m1.
;   In:   r2 = reference bytes; 16-byte loads at r2 + 1, + 5, + 10, + 14 and an
;              8-byte load at r2 + 18
;         r3 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r3 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_5 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_5 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 directly.
;   Pattern used throughout: palignr m1, m0, 1 shifts the bytes down by one, so
;   punpcklbw/punpckhbw interleave every byte with its successor; palignr
;   mX, m2, m0, 2 * k then selects the byte pairs starting k bytes further on,
;   and pmaddubsw multiplies each pair by a coefficient row.
6399 %macro MODE_5_31 1
; ---- group 0: byte pairs taken from r2 + 1 ----
6400 movu m0, [r2 + 1]
6401 palignr m1, m0, 1
6402 punpckhbw m2, m0, m1
6403 punpcklbw m0, m1
6404 palignr m1, m2, m0, 2
6405 mova m5, m1
6406 pmaddubsw m4, m0, [r3 + 16] ; [17]
6407 pmulhrsw m4, m7
6408 pmaddubsw m1, [r3 - 14 * 16] ; [2]
6409 pmulhrsw m1, m7
6410 packuswb m4, m1
6411 pmaddubsw m5, [r3 + 3 * 16] ; [19]
6412 pmulhrsw m5, m7
6413 palignr m6, m2, m0, 4
6414 mova m1, m6
6415 pmaddubsw m6, [r3 - 12 * 16] ; [4]
6416 pmulhrsw m6, m7
6417 packuswb m5, m6
6418 pmaddubsw m6, m1, [r3 + 5 * 16] ; [21]
6419 pmulhrsw m6, m7
6420 palignr m1, m2, m0, 6
6421 mova m3, m1
6422 pmaddubsw m3, [r3 - 10 * 16] ; [6]
6423 pmulhrsw m3, m7
6424 packuswb m6, m3
6425 pmaddubsw m1, [r3 + 7 * 16] ; [23]
6426 pmulhrsw m1, m7
6427 palignr m2, m0, 8
6428 pmaddubsw m2, [r3 - 8 * 16] ; [8]
6429 pmulhrsw m2, m7
6430 packuswb m1, m2
6431
6432 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6433
; ---- group 1: byte pairs taken from r2 + 5 ----
6434 movu m0, [r2 + 5]
6435 palignr m1, m0, 1
6436 punpckhbw m2, m0, m1
6437 punpcklbw m0, m1
6438 palignr m1, m2, m0, 2
6439 mova m5, m1
6440 pmaddubsw m4, m0, [r3 + 9 * 16] ; [25]
6441 pmulhrsw m4, m7
6442 pmaddubsw m1, [r3 - 6 * 16] ; [10]
6443 pmulhrsw m1, m7
6444 packuswb m4, m1
6445 pmaddubsw m5, [r3 + 11 * 16] ; [27]
6446 pmulhrsw m5, m7
6447 palignr m6, m2, m0, 4
6448 mova m1, m6
6449 pmaddubsw m6, [r3 - 4 * 16] ; [12]
6450 pmulhrsw m6, m7
6451 packuswb m5, m6
6452 pmaddubsw m6, m1, [r3 + 13 * 16] ; [29]
6453 pmulhrsw m6, m7
6454 palignr m1, m2, m0, 6
6455 mova m3, m1
6456 pmaddubsw m3, [r3 - 2 * 16] ; [14]
6457 pmulhrsw m3, m7
6458 packuswb m6, m3
6459 pmaddubsw m1, [r3 + 15 * 16] ; [31]
6460 pmulhrsw m1, m7
6461 palignr m2, m0, 8
6462 pmaddubsw m2, [r3] ; [16]
6463 pmulhrsw m2, m7
6464 packuswb m1, m2
6465
6466 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6467
; ---- group 2: byte pairs taken from r2 + 10 ----
6468 movu m0, [r2 + 10]
6469 palignr m1, m0, 1
6470 punpckhbw m2, m0, m1
6471 punpcklbw m0, m1
6472 mova m1, m0
6473 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
6474 pmulhrsw m4, m7
6475 pmaddubsw m1, [r3 + 2 * 16] ; [18]
6476 pmulhrsw m1, m7
6477 packuswb m4, m1
6478 palignr m5, m2, m0, 2
6479 mova m1, m5
6480 pmaddubsw m5, [r3 - 13 * 16] ; [3]
6481 pmulhrsw m5, m7
6482 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6483 pmulhrsw m1, m7
6484 packuswb m5, m1
6485 palignr m1, m2, m0, 4
6486 pmaddubsw m6, m1, [r3 - 11 * 16] ; [5]
6487 pmulhrsw m6, m7
6488 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6489 pmulhrsw m1, m7
6490 packuswb m6, m1
6491 palignr m2, m0, 6
6492 pmaddubsw m1, m2, [r3 - 9 * 16] ; [7]
6493 pmulhrsw m1, m7
6494 pmaddubsw m2, [r3 + 8 * 16] ; [24]
6495 pmulhrsw m2, m7
6496 packuswb m1, m2
6497
6498 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6499
; ---- group 3: byte pairs taken from r2 + 14; last result is a raw copy ----
6500 movu m0, [r2 + 14]
6501 palignr m1, m0, 1
6502 punpckhbw m2, m0, m1
6503 punpcklbw m0, m1
6504 mova m1, m0
6505 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
6506 pmulhrsw m4, m7
6507 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6508 pmulhrsw m1, m7
6509 packuswb m4, m1
6510 palignr m5, m2, m0, 2
6511 mova m1, m5
6512 pmaddubsw m5, [r3 - 5 * 16] ; [11]
6513 pmulhrsw m5, m7
6514 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6515 pmulhrsw m1, m7
6516 packuswb m5, m1
6517 palignr m1, m2, m0, 4
6518 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
6519 pmulhrsw m6, m7
6520 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6521 pmulhrsw m1, m7
6522 packuswb m6, m1
6523 palignr m2, m0, 6
6524 pmaddubsw m1, m2, [r3 - 16] ; [15]
6525 pmulhrsw m1, m7
6526 packuswb m1, m1
; high half of m1 = 8 unmodified reference bytes (no multiply for this result)
6527 movhps m1, [r2 + 18] ; [00]
6528
6529 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6530 %endmacro
6531 ;------------------------------------------------------------------------------------------------------------------
6532 ; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6533 ;------------------------------------------------------------------------------------------------------------------
; cglobal loads only the first three arguments: r0 = dst, r1 = dstStride, r2 = refLeft.
; MODE_5_31 is expanded once in the loop body and executed four times; between
; passes r0/r6 are moved on by 8 * stride and r2 by 8 reference bytes.
6534 INIT_XMM sse4
6535 cglobal intra_pred_ang32_5, 3,7,8
6536 mova m7, [pw_1024] ; m7 = multiplier for every pmulhrsw in MODE_5_31
6537 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
6538 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride
6539 lea r5, [r1 * 3] ; r5 = 3 * stride
6540 mov r4d, 4 ; r4 = passes left
6541 .loop:
6542 MODE_5_31 1
6543 add r2, 8 ; step the reference pointer (flags are rewritten by dec below)
6544 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6545 lea r6, [r6 + r1 * 8] ; r6 = r6 + 8 * stride
6546 dec r4d
6547 jnz .loop
6548 RET
6549
; MODE_6_30 %1
;   Produces 32 interpolated 8-byte results in four groups of eight and passes
;   each group, packed two results per register in m4, m5, m6, m1, to
;   TRANSPOSE_STORE_8x8 <group 0..3>, %1, m4, m5, m6, m1.
;   In:   r2 = reference bytes; 16-byte loads at r2 + 1, + 5, + 7, + 11 and an
;              8-byte load at r2 + 14
;         r3 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r3 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_6 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_6 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 directly. m2 is deliberately kept live across the first two
;   TRANSPOSE_STORE_8x8 invocations: the first result of groups 1 and 2 is
;   computed from the byte pairs left in m2 by the previous group.
6550 %macro MODE_6_30 1
; ---- group 0: byte pairs taken from r2 + 1 ----
6551 movu m0, [r2 + 1]
6552 palignr m1, m0, 1
6553 punpckhbw m2, m0, m1
6554 punpcklbw m0, m1
6555 mova m1, m0
6556 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
6557 pmulhrsw m4, m7
6558 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6559 pmulhrsw m1, m7
6560 packuswb m4, m1
6561 palignr m6, m2, m0, 2
6562 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
6563 pmulhrsw m5, m7
6564 pmaddubsw m6, [r3 + 4 * 16] ; [20]
6565 pmulhrsw m6, m7
6566 packuswb m5, m6
6567 palignr m1, m2, m0, 4
6568 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
6569 pmulhrsw m6, m7
6570 pmaddubsw m3, m1, [r3 - 2 * 16] ; [14]
6571 pmulhrsw m3, m7
6572 packuswb m6, m3
6573 pmaddubsw m1, [r3 + 11 * 16] ; [27]
6574 pmulhrsw m1, m7
6575 palignr m2, m0, 6
6576 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
6577 pmulhrsw m3, m7
6578 packuswb m1, m3
6579
6580 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6581
; ---- group 1: first result reuses m2 from group 0, then pairs from r2 + 5 ----
6582 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
6583 pmulhrsw m4, m7
6584 movu m0, [r2 + 5]
6585 palignr m1, m0, 1
6586 punpckhbw m2, m0, m1
6587 punpcklbw m0, m1
6588 mova m6, m0
6589 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
6590 pmulhrsw m1, m7
6591 packuswb m4, m1
6592 pmaddubsw m5, m6, [r3 - 16] ; [15]
6593 pmulhrsw m5, m7
6594 pmaddubsw m6, [r3 + 12 * 16] ; [28]
6595 pmulhrsw m6, m7
6596 packuswb m5, m6
6597 palignr m3, m2, m0, 2
6598 pmaddubsw m6, m3, [r3 - 7 * 16] ; [9]
6599 pmulhrsw m6, m7
6600 pmaddubsw m3, [r3 + 6 * 16] ; [22]
6601 pmulhrsw m3, m7
6602 packuswb m6, m3
6603 palignr m2, m0, 4
6604 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
6605 pmulhrsw m1, m7
6606 pmaddubsw m3, m2, [r3] ; [16]
6607 pmulhrsw m3, m7
6608 packuswb m1, m3
6609
6610 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6611
; ---- group 2: first result reuses m2 from group 1, then pairs from r2 + 7 ----
6612 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6613 pmulhrsw m4, m7
6614 movu m0, [r2 + 7]
6615 palignr m1, m0, 1
6616 punpckhbw m2, m0, m1
6617 punpcklbw m0, m1
6618 palignr m5, m2, m0, 2
6619 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
6620 pmulhrsw m1, m7
6621 packuswb m4, m1
6622 pmaddubsw m5, [r3 + 7 * 16] ; [23]
6623 pmulhrsw m5, m7
6624 palignr m1, m2, m0, 4
6625 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
6626 pmulhrsw m6, m7
6627 packuswb m5, m6
6628 pmaddubsw m6, m1, [r3 + 16] ; [17]
6629 pmulhrsw m6, m7
6630 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6631 pmulhrsw m1, m7
6632 packuswb m6, m1
6633 palignr m2, m2, m0, 6
6634 pmaddubsw m1, m2, [r3 - 5 * 16] ; [11]
6635 pmulhrsw m1, m7
6636 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
6637 pmulhrsw m2, m7
6638 packuswb m1, m2
6639
6640 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6641
; ---- group 3: byte pairs taken from r2 + 11; last result is a raw copy ----
6642 movu m0, [r2 + 11]
6643 palignr m1, m0, 1
6644 punpckhbw m2, m0, m1
6645 punpcklbw m0, m1
6646 mova m5, m0
6647 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6648 pmulhrsw m4, m7
6649 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
6650 pmulhrsw m3, m7
6651 packuswb m4, m3
6652 pmaddubsw m5, [r3 + 15 * 16] ; [31]
6653 pmulhrsw m5, m7
6654 palignr m6, m2, m0, 2
6655 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
6656 pmulhrsw m1, m7
6657 packuswb m5, m1
6658 pmaddubsw m6, [r3 + 9 * 16] ; [25]
6659 pmulhrsw m6, m7
6660 palignr m1, m2, m0, 4
6661 pmaddubsw m2, m1, [r3 - 10 * 16] ; [6]
6662 pmulhrsw m2, m7
6663 packuswb m6, m2
6664 pmaddubsw m1, [r3 + 3 * 16] ; [19]
6665 pmulhrsw m1, m7
6666 packuswb m1, m1
; high half of m1 = 8 unmodified reference bytes (no multiply for this result)
6667 movhps m1, [r2 + 14] ; [00]
6668
6669 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6670 %endmacro
6671 ;------------------------------------------------------------------------------------------------------------------
6672 ; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6673 ;------------------------------------------------------------------------------------------------------------------
; cglobal loads only the first three arguments: r0 = dst, r1 = dstStride, r2 = refLeft.
; MODE_6_30 is expanded once in the loop body and executed four times; between
; passes r0/r6 are moved on by 8 * stride and r2 by 8 reference bytes.
6674 INIT_XMM sse4
6675 cglobal intra_pred_ang32_6, 3,7,8
6676 mova m7, [pw_1024] ; m7 = multiplier for every pmulhrsw in MODE_6_30
6677 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
6678 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride
6679 lea r5, [r1 * 3] ; r5 = 3 * stride
6680 mov r4d, 4 ; r4 = passes left
6681 .loop:
6682 MODE_6_30 1
6683 add r2, 8 ; step the reference pointer (flags are rewritten by dec below)
6684 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6685 lea r6, [r6 + r1 * 8] ; r6 = r6 + 8 * stride
6686 dec r4d
6687 jnz .loop
6688 RET
6689
; MODE_7_29 %1
;   Produces 32 interpolated 8-byte results in four groups of eight and passes
;   each group, packed two results per register in m4, m5, m6, m1, to
;   TRANSPOSE_STORE_8x8 <group 0..3>, %1, m4, m5, m6, m1.
;   In:   r2 = reference bytes; 16-byte loads at r2 + 1, + 4, + 6, + 8 and an
;              8-byte load at r2 + 10
;         r3 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r3 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_7 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_7 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 directly. m2 is kept live across the first two
;   TRANSPOSE_STORE_8x8 invocations: groups 1 and 2 start with results computed
;   from the byte pairs the previous group left in m2.
6690 %macro MODE_7_29 1
; ---- group 0: byte pairs taken from r2 + 1 ----
6691 movu m0, [r2 + 1]
6692 palignr m1, m0, 1
6693 punpckhbw m2, m0, m1
6694 punpcklbw m0, m1
6695 mova m5, m0
6696 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
6697 pmulhrsw m4, m7
6698 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
6699 pmulhrsw m3, m7
6700 packuswb m4, m3
6701 pmaddubsw m5, [r3 + 11 * 16] ; [27]
6702 pmulhrsw m5, m7
; m1 = pairs one byte further on, m2 = pairs two bytes further on
6703 palignr m1, m2, m0, 2
6704 palignr m2, m0, 4
6705 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
6706 pmulhrsw m6, m7
6707 packuswb m5, m6
6708 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
6709 pmulhrsw m6, m7
6710 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
6711 pmulhrsw m0, m7
6712 packuswb m6, m0
6713 pmaddubsw m1, [r3 + 15 * 16] ; [31]
6714 pmulhrsw m1, m7
6715 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
6716 pmulhrsw m0, m7
6717 packuswb m1, m0
6718
6719 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6720
; ---- group 1: first two results reuse m2 from group 0, then pairs from r2 + 4 ----
6721 pmaddubsw m4, m2, [r3 + 16] ; [17]
6722 pmulhrsw m4, m7
6723 pmaddubsw m2, [r3 + 10 * 16] ; [26]
6724 pmulhrsw m2, m7
6725 packuswb m4, m2
6726 movu m0, [r2 + 4]
6727 palignr m1, m0, 1
6728 punpckhbw m2, m0, m1
6729 punpcklbw m0, m1
6730 palignr m2, m0, 2
6731 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
6732 pmulhrsw m5, m7
6733 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
6734 pmulhrsw m6, m7
6735 packuswb m5, m6
6736 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
6737 pmulhrsw m6, m7
6738 pmaddubsw m0, [r3 + 14 * 16] ; [30]
6739 pmulhrsw m0, m7
6740 packuswb m6, m0
6741 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
6742 pmulhrsw m1, m7
6743 pmaddubsw m3, m2, [r3] ; [16]
6744 pmulhrsw m3, m7
6745 packuswb m1, m3
6746
6747 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6748
; ---- group 2: first result reuses m2 from group 1, then pairs from r2 + 6 ----
6749 pmaddubsw m4, m2, [r3 + 9 * 16] ; [25]
6750 pmulhrsw m4, m7
6751 movu m0, [r2 + 6]
6752 palignr m1, m0, 1
6753 punpckhbw m2, m0, m1
6754 punpcklbw m0, m1
6755 palignr m2, m0, 2
6756 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
6757 pmulhrsw m1, m7
6758 packuswb m4, m1
6759 pmaddubsw m5, m0, [r3 - 5 * 16] ; [11]
6760 pmulhrsw m5, m7
6761 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
6762 pmulhrsw m6, m7
6763 packuswb m5, m6
6764 pmaddubsw m6, m0, [r3 + 13 * 16] ; [29]
6765 pmulhrsw m6, m7
6766 pmaddubsw m1, m2, [r3 - 10 * 16] ; [6]
6767 pmulhrsw m1, m7
6768 packuswb m6, m1
6769 pmaddubsw m1, m2, [r3 - 16] ; [15]
6770 pmulhrsw m1, m7
6771 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
6772 pmulhrsw m2, m7
6773 packuswb m1, m2
6774
6775 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6776
; ---- group 3: byte pairs taken from r2 + 8; last result is a raw copy ----
6777 movu m0, [r2 + 8]
6778 palignr m1, m0, 1
6779 punpckhbw m2, m0, m1
6780 punpcklbw m0, m1
6781 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
6782 pmulhrsw m4, m7
6783 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
6784 pmulhrsw m3, m7
6785 packuswb m4, m3
6786 pmaddubsw m5, m0, [r3 + 3 * 16] ; [19]
6787 pmulhrsw m5, m7
6788 pmaddubsw m6, m0, [r3 + 12 * 16] ; [28]
6789 pmulhrsw m6, m7
6790 packuswb m5, m6
6791 palignr m2, m0, 2
6792 pmaddubsw m6, m2, [r3 - 11 * 16] ; [5]
6793 pmulhrsw m6, m7
6794 pmaddubsw m0, m2, [r3 - 2 * 16] ; [14]
6795 pmulhrsw m0, m7
6796 packuswb m6, m0
6797 pmaddubsw m1, m2, [r3 + 7 * 16] ; [23]
6798 pmulhrsw m1, m7
6799 packuswb m1, m1
; high half of m1 = 8 unmodified reference bytes (no multiply for this result)
6800 movhps m1, [r2 + 10] ; [0]
6801
6802 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6803 %endmacro
6804 ;------------------------------------------------------------------------------------------------------------------
6805 ; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6806 ;------------------------------------------------------------------------------------------------------------------
; cglobal loads only the first three arguments: r0 = dst, r1 = dstStride, r2 = refLeft.
; MODE_7_29 is expanded once in the loop body and executed four times; between
; passes r0/r6 are moved on by 8 * stride and r2 by 8 reference bytes.
6807 INIT_XMM sse4
6808 cglobal intra_pred_ang32_7, 3,7,8
6809 mova m7, [pw_1024] ; m7 = multiplier for every pmulhrsw in MODE_7_29
6810 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
6811 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride
6812 lea r5, [r1 * 3] ; r5 = 3 * stride
6813 mov r4d, 4 ; r4 = passes left
6814 .loop:
6815 MODE_7_29 1
6816 add r2, 8 ; step the reference pointer (flags are rewritten by dec below)
6817 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6818 lea r6, [r6 + r1 * 8] ; r6 = r6 + 8 * stride
6819 dec r4d
6820 jnz .loop
6821 RET
6822
; MODE_8_28 %1
;   Produces 32 interpolated 8-byte results in four groups of eight and passes
;   each group, packed two results per register in m4, m5, m6, m1, to
;   TRANSPOSE_STORE_8x8 <group 0..3>, %1, m4, m5, m6, m1.
;   In:   r2 = reference bytes; 16-byte loads at r2 + 1, + 3, + 4, + 5 and an
;              8-byte load at r2 + 6
;         r3 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r3 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_8 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_8 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 directly. m2 is kept live across every TRANSPOSE_STORE_8x8:
;   each following group starts with results computed from the byte pairs
;   saved in m2 by the previous group.
;   Only group 0 needs the high-half interleave (for its palignr); the
;   punpckhbw that used to follow the three later loads wrote an m2 value that
;   was always overwritten or never read, so it has been dropped.
6823 %macro MODE_8_28 1
; ---- group 0: byte pairs taken from r2 + 1 (m0) and r2 + 2 (m2) ----
6824 movu m0, [r2 + 1]
6825 palignr m1, m0, 1
6826 punpckhbw m2, m0, m1
6827 punpcklbw m0, m1
6828 palignr m2, m0, 2
6829 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6830 pmulhrsw m4, m7
6831 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
6832 pmulhrsw m3, m7
6833 packuswb m4, m3
6834 pmaddubsw m5, m0, [r3 - 1 * 16] ; [15]
6835 pmulhrsw m5, m7
6836 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
6837 pmulhrsw m6, m7
6838 packuswb m5, m6
6839 pmaddubsw m6, m0, [r3 + 9 * 16] ; [25]
6840 pmulhrsw m6, m7
6841 pmaddubsw m0, [r3 + 14 * 16] ; [30]
6842 pmulhrsw m0, m7
6843 packuswb m6, m0
6844 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
6845 pmulhrsw m1, m7
6846 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
6847 pmulhrsw m0, m7
6848 packuswb m1, m0
6849
6850 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6851
; ---- group 1: four results reuse m2 from group 0, then pairs from r2 + 3 ----
6852 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
6853 pmulhrsw m4, m7
6854 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
6855 pmulhrsw m5, m7
6856 packuswb m4, m5
6857 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
6858 pmulhrsw m5, m7
6859 pmaddubsw m2, [r3 + 12 * 16] ; [28]
6860 pmulhrsw m2, m7
6861 packuswb m5, m2
6862 movu m0, [r2 + 3]
6863 palignr m1, m0, 1
6865 punpcklbw m0, m1
6866 pmaddubsw m6, m0, [r3 - 15 * 16] ; [01]
6867 pmulhrsw m6, m7
6868 pmaddubsw m1, m0, [r3 - 10 * 16] ; [06]
6869 pmulhrsw m1, m7
6870 packuswb m6, m1
6871 pmaddubsw m1, m0, [r3 - 5 * 16] ; [11]
6872 pmulhrsw m1, m7
6873 mova m2, m0 ; keep the pairs for group 2
6874 pmaddubsw m0, [r3] ; [16]
6875 pmulhrsw m0, m7
6876 packuswb m1, m0
6877
6878 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6879
; ---- group 2: three results reuse m2 from group 1, then pairs from r2 + 4 ----
6880 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
6881 pmulhrsw m4, m7
6882 pmaddubsw m5, m2, [r3 + 10 * 16] ; [26]
6883 pmulhrsw m5, m7
6884 packuswb m4, m5
6885 pmaddubsw m5, m2, [r3 + 15 * 16] ; [31]
6886 pmulhrsw m5, m7
6887 movu m0, [r2 + 4]
6888 palignr m1, m0, 1
6890 punpcklbw m0, m1
6891 pmaddubsw m2, m0, [r3 - 12 * 16] ; [4]
6892 pmulhrsw m2, m7
6893 packuswb m5, m2
6894 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
6895 pmulhrsw m6, m7
6896 pmaddubsw m1, m0, [r3 - 2 * 16] ; [14]
6897 pmulhrsw m1, m7
6898 packuswb m6, m1
6899 pmaddubsw m1, m0, [r3 + 3 * 16] ; [19]
6900 pmulhrsw m1, m7
6901 mova m2, m0 ; keep the pairs for group 3
6902 pmaddubsw m0, [r3 + 8 * 16] ; [24]
6903 pmulhrsw m0, m7
6904 packuswb m1, m0
6905
6906 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6907
; ---- group 3: first result reuses m2 from group 2, then pairs from r2 + 5 ----
6908 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6909 pmulhrsw m4, m7
6910 movu m0, [r2 + 5]
6911 palignr m1, m0, 1
6913 punpcklbw m0, m1
6914 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
6915 pmulhrsw m1, m7
6916 packuswb m4, m1
6917 pmaddubsw m5, m0, [r3 - 9 * 16] ; [7]
6918 pmulhrsw m5, m7
6919 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
6920 pmulhrsw m6, m7
6921 packuswb m5, m6
6922 pmaddubsw m6, m0, [r3 + 16] ; [17]
6923 pmulhrsw m6, m7
6924 pmaddubsw m1, m0, [r3 + 6 * 16] ; [22]
6925 pmulhrsw m1, m7
6926 packuswb m6, m1
6927 pmaddubsw m1, m0, [r3 + 11 * 16] ; [27]
6928 pmulhrsw m1, m7
6929 packuswb m1, m1
; high half of m1 = 8 unmodified reference bytes (no multiply for this result)
6930 movhps m1, [r2 + 6] ; [00]
6931
6932 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6933 %endmacro
6934 ;------------------------------------------------------------------------------------------------------------------
6935 ; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6936 ;------------------------------------------------------------------------------------------------------------------
; cglobal loads only the first three arguments: r0 = dst, r1 = dstStride, r2 = refLeft.
; MODE_8_28 is expanded once in the loop body and executed four times; between
; passes r0/r6 are moved on by 8 * stride and r2 by 8 reference bytes.
6937 INIT_XMM sse4
6938 cglobal intra_pred_ang32_8, 3,7,8
6939 mova m7, [pw_1024] ; m7 = multiplier for every pmulhrsw in MODE_8_28
6940 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
6941 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride
6942 lea r5, [r1 * 3] ; r5 = 3 * stride
6943 mov r4d, 4 ; r4 = passes left
6944 .loop:
6945 MODE_8_28 1
6946 add r2, 8 ; step the reference pointer (flags are rewritten by dec below)
6947 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6948 lea r6, [r6 + r1 * 8] ; r6 = r6 + 8 * stride
6949 dec r4d
6950 jnz .loop
6951 RET
6952
; MODE_9_27 %1
;   Produces 32 interpolated 8-byte results in four groups of eight and passes
;   each group, packed two results per register in m4, m5, m6, m1, to
;   TRANSPOSE_STORE_8x8 <group 0..3>, %1, m4, m5, m6, m1.
;   In:   r2 = reference bytes; 16-byte loads at r2 + 1 and r2 + 2, 8-byte
;              loads at r2 + 2 and r2 + 3
;         r3 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r3 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_9 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_9 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 directly. Only the low eight byte pairs (punpcklbw) are ever
;   used, so no high-half interleave is computed; group 0 used to build one in
;   m0 that was overwritten before being read, and it is now gone, matching
;   groups 2 and 3.
6953 %macro MODE_9_27 1
; ---- group 0: byte pairs taken from r2 + 1 ----
6954 movu m2, [r2 + 1]
6955 palignr m1, m2, 1
6957 punpcklbw m2, m1
6958 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
6959 pmulhrsw m4, m7
6960 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
6961 pmulhrsw m3, m7
6962 packuswb m4, m3
6963 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
6964 pmulhrsw m5, m7
6965 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
6966 pmulhrsw m6, m7
6967 packuswb m5, m6
6968 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
6969 pmulhrsw m6, m7
6970 pmaddubsw m3, m2, [r3 - 4 * 16] ; [12]
6971 pmulhrsw m3, m7
6972 packuswb m6, m3
6973 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
6974 pmulhrsw m1, m7
6975 pmaddubsw m0, m2, [r3] ; [16]
6976 pmulhrsw m0, m7
6977 packuswb m1, m0
6978
6979 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6980
; ---- group 1: same pairs (m2 is still live); last result is a raw copy ----
6981 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
6982 pmulhrsw m4, m7
6983 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
6984 pmulhrsw m5, m7
6985 packuswb m4, m5
6986 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
6987 pmulhrsw m5, m7
6988 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
6989 pmulhrsw m6, m7
6990 packuswb m5, m6
6991 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
6992 pmulhrsw m6, m7
6993 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
6994 pmulhrsw m1, m7
6995 packuswb m6, m1
6996 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
6997 pmulhrsw m1, m7
6998 packuswb m1, m1
; high half of m1 = 8 unmodified reference bytes (no multiply for this result)
6999 movhps m1, [r2 + 2] ; [00]
7000
7001 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7002
; ---- group 2: byte pairs taken from r2 + 2 ----
7003 movu m2, [r2 + 2]
7004 palignr m1, m2, 1
7005 punpcklbw m2, m1
7006 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
7007 pmulhrsw m4, m7
7008 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
7009 pmulhrsw m3, m7
7010 packuswb m4, m3
7011 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
7012 pmulhrsw m5, m7
7013 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
7014 pmulhrsw m6, m7
7015 packuswb m5, m6
7016 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
7017 pmulhrsw m6, m7
7018 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
7019 pmulhrsw m0, m7
7020 packuswb m6, m0
7021 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
7022 pmulhrsw m1, m7
7023 pmaddubsw m0, m2, [r3] ; [16]
7024 pmulhrsw m0, m7
7025 packuswb m1, m0
7026
7027 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7028
; ---- group 3: pairs from r2 + 2 again; last result is a raw copy ----
7029 movu m2, [r2 + 2]
7030 palignr m1, m2, 1
7031 punpcklbw m2, m1
7032 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
7033 pmulhrsw m4, m7
7034 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
7035 pmulhrsw m5, m7
7036 packuswb m4, m5
7037 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
7038 pmulhrsw m5, m7
7039 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
7040 pmulhrsw m6, m7
7041 packuswb m5, m6
7042 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
7043 pmulhrsw m6, m7
7044 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
7045 pmulhrsw m1, m7
7046 packuswb m6, m1
7047 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
7048 pmulhrsw m1, m7
7049 packuswb m1, m1
; high half of m1 = 8 unmodified reference bytes (no multiply for this result)
7050 movhps m1, [r2 + 3] ; [00]
7051
7052 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7053 %endmacro
7054 ;------------------------------------------------------------------------------------------------------------------
7055 ; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7056 ;------------------------------------------------------------------------------------------------------------------
; cglobal loads only the first three arguments: r0 = dst, r1 = dstStride, r2 = refLeft.
; MODE_9_27 is expanded once in the loop body and executed four times; between
; passes r0/r6 are moved on by 8 * stride and r2 by 8 reference bytes.
7057 INIT_XMM sse4
7058 cglobal intra_pred_ang32_9, 3,7,8
7059 mova m7, [pw_1024] ; m7 = multiplier for every pmulhrsw in MODE_9_27
7060 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
7061 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride
7062 lea r5, [r1 * 3] ; r5 = 3 * stride
7063 mov r4d, 4 ; r4 = passes left
7064 .loop:
7065 MODE_9_27 1
7066 add r2, 8 ; step the reference pointer (flags are rewritten by dec below)
7067 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
7068 lea r6, [r6 + r1 * 8] ; r6 = r6 + 8 * stride
7069 dec r4d
7070 jnz .loop
7071 RET
7072
7073 ;------------------------------------------------------------------------------------------------------------------
7074 ; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7075 ;------------------------------------------------------------------------------------------------------------------
; All six arguments are loaded: r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove, r5 = bFilter (r4/dirMode is overwritten immediately).
; Two passes of 16 rows each. In a pass, m0 = 16 bytes at r2 + 1 and row k of
; the pass is byte k of m0 replicated over 32 columns (two 16-byte stores).
; Row 0 of the pass is stored last, at .quit, optionally adjusted by the
; filter: row0[i] = clip8(m0[0] + ((m9[i] - m8[0]) >> 1)), with m8 / m9 the 16
; bytes saved from refAbove + 0 / refAbove + 1 on entry.
; NOTE(review): the filtered row is stored identically to columns 0-15 and
; 16-31 and the filter branch is taken in both passes; presumably callers never
; set bFilter for 32x32 blocks - confirm against the C caller.
7076 INIT_XMM sse4
7077 cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
7078 %define m8 [rsp + 0 * mmsize]
7079 %define m9 [rsp + 1 * mmsize]
7080 lea r4, [r1 * 3] ; r4 = 3 * stride
7081 pxor m7, m7 ; m7 = all-zero pshufb mask -> replicate byte 0
7082 mov r6, 2 ; r6 = passes left
7083 movu m0, [r3]
7084 movu m1, [r3 + 1]
7085 mova m8, m0 ; save refAbove[0..15] for the filter
7086 mova m9, m1 ; save refAbove[1..16] for the filter
7087 mov r3d, r5d ; r3d = bFilter (r5 is reused as a row pointer below)
7088
7089 .loop:
7090 movu m0, [r2 + 1]
; rows 1-6: palignr moves byte k to position 0, pshufb replicates it
7091 palignr m1, m0, 1
7092 pshufb m1, m7
7093 palignr m2, m0, 2
7094 pshufb m2, m7
7095 palignr m3, m0, 3
7096 pshufb m3, m7
7097 palignr m4, m0, 4
7098 pshufb m4, m7
7099 palignr m5, m0, 5
7100 pshufb m5, m7
7101 palignr m6, m0, 6
7102 pshufb m6, m7
7103
7104 movu [r0 + r1], m1
7105 movu [r0 + r1 + 16], m1
7106 movu [r0 + r1 * 2], m2
7107 movu [r0 + r1 * 2 + 16], m2
7108 movu [r0 + r4], m3
7109 movu [r0 + r4 + 16], m3
7110 lea r5, [r0 + r1 * 4] ; r5 = row 4 of this pass
7111 movu [r5], m4
7112 movu [r5 + 16], m4
7113 movu [r5 + r1], m5
7114 movu [r5 + r1 + 16], m5
7115 movu [r5 + r1 * 2], m6
7116 movu [r5 + r1 * 2 + 16], m6
7117
; rows 7-12 (movhlps brings byte 8 to position 0)
7118 palignr m1, m0, 7
7119 pshufb m1, m7
7120 movhlps m2, m0
7121 pshufb m2, m7
7122 palignr m3, m0, 9
7123 pshufb m3, m7
7124 palignr m4, m0, 10
7125 pshufb m4, m7
7126 palignr m5, m0, 11
7127 pshufb m5, m7
7128 palignr m6, m0, 12
7129 pshufb m6, m7
7130
7131 movu [r5 + r4], m1
7132 movu [r5 + r4 + 16], m1
7133 lea r5, [r5 + r1 * 4] ; r5 = row 8 of this pass
7134 movu [r5], m2
7135 movu [r5 + 16], m2
7136 movu [r5 + r1], m3
7137 movu [r5 + r1 + 16], m3
7138 movu [r5 + r1 * 2], m4
7139 movu [r5 + r1 * 2 + 16], m4
7140 movu [r5 + r4], m5
7141 movu [r5 + r4 + 16], m5
7142 lea r5, [r5 + r1 * 4] ; r5 = row 12 of this pass
7143 movu [r5], m6
7144 movu [r5 + 16], m6
7145
; rows 13-15, then m0 itself becomes row 0 (byte 0 replicated)
7146 palignr m1, m0, 13
7147 pshufb m1, m7
7148 palignr m2, m0, 14
7149 pshufb m2, m7
7150 palignr m3, m0, 15
7151 pshufb m3, m7
7152 pshufb m0, m7
7153
7154 movu [r5 + r1], m1
7155 movu [r5 + r1 + 16], m1
7156 movu [r5 + r1 * 2], m2
7157 movu [r5 + r1 * 2 + 16], m2
7158 movu [r5 + r4], m3
7159 movu [r5 + r4 + 16], m3
7160
7161 ; filter
7162 test r3d, r3d ; bFilter == 0 -> store row 0 unfiltered
7163 jz .quit
; every byte of m0 is the same value here, so both word halves are identical
7165 pmovzxbw m0, m0
7166 mova m1, m0
7167 movu m2, m8
7168 movu m3, m9
7169
7170 pshufb m2, m7 ; m2 = first saved refAbove byte, replicated
7171 pmovzxbw m2, m2
7172 movhlps m4, m3
7173 pmovzxbw m3, m3
7174 pmovzxbw m4, m4
7175 psubw m3, m2
7176 psubw m4, m2
7177 psraw m3, 1
7178 psraw m4, 1
7179 paddw m0, m3
7180 paddw m1, m4
7181 packuswb m0, m1
7182
7183 .quit:
7184 movu [r0], m0
7185 movu [r0 + 16], m0
7186 dec r6 ; the two lea below leave these flags intact for jnz
7187 lea r0, [r5 + r1 * 4] ; r0 = row 16 of this pass = row 0 of the next
7188 lea r2, [r2 + 16]
7189 jnz .loop
7190 RET
7191
7192 ;-------------------------------------------------------------------------------------------------------------------
7193 ; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7194 ;-------------------------------------------------------------------------------------------------------------------
; Loads r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; Builds a private copy of the reference on a 64-byte-aligned stack area:
;   [rsp]            = byte at refAbove + 16 (the 16-byte broadcast stored at
;                      rsp is overwritten from rsp + 1 upward by the copy below)
;   [rsp + 1 .. +48] = 48 bytes copied from refLeft
;   [rsp + 63]       = loop counter (4 passes)
;   [rsp + 64]       = caller's rsp, restored before RET
; Each pass expands PROC32_8x8 four times (defined elsewhere in this file) and
; then advances r0/r6 by 8 * stride and r2 by 8 bytes.
; The former pre-loop loads of pw_1024 into m5 and c_deinterval8 into m6 were
; removed: the first instructions of .loop overwrite m5 and m6 with a copy of
; m7 before anything can read them.
7195 INIT_XMM sse4
7196 cglobal intra_pred_ang32_11, 4,7,8
7197 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7198
7199 mov r6, rsp
7200 sub rsp, 64+gprsize
7201 and rsp, ~63
7202 mov [rsp+64], r6
7203
7204 ; collect reference pixel
7205 movu m0, [r3 + 16]
7206 pxor m1, m1
7207 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
7208 mova [rsp], m0
7209 movu m0, [r2]
7210 movu m1, [r2 + 16]
7211 movu m2, [r2 + 32]
7212 movu [rsp + 1], m0
7213 movu [rsp + 1 + 16], m1
7214 movu [rsp + 1 + 32], m2
7215 mov [rsp + 63], byte 4
7216
7217 ; filter
7218 lea r2, [rsp + 1] ; r2 -> [0]
7219 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7220 lea r4, [ang_table] ; r4 -> ang_table
7221 lea r5, [r1 * 3] ; r5 -> 3 * stride
7222 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
7225
7226 .loop:
7227 ; Row[0 - 7]
7228 movu m7, [r2]
7229 mova m0, m7
7230 mova m1, m7
7231 mova m2, m7
7232 mova m3, m7
7233 mova m4, m7
7234 mova m5, m7
7235 mova m6, m7
7236 PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16
7237
7238 ; Row[8 - 15]
7239 movu m7, [r2]
7240 mova m0, m7
7241 mova m1, m7
7242 mova m2, m7
7243 mova m3, m7
7244 mova m4, m7
7245 mova m5, m7
7246 mova m6, m7
7247 PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0
7248
7249 ; Row[16 - 23]
7250 movu m7, [r2 - 1]
7251 mova m0, m7
7252 mova m1, m7
7253 mova m2, m7
7254 mova m3, m7
7255 mova m4, m7
7256 mova m5, m7
7257 mova m6, m7
7258 PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16
7259
7260 ; Row[24 - 31]
7261 movu m7, [r2 - 1]
7262 mova m0, m7
7263 mova m1, m7
7264 mova m2, m7
7265 mova m3, m7
7266 mova m4, m7
7267 mova m5, m7
7268 mova m6, m7
7269 PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0
7270
7271 lea r0, [r6 + r1 * 4]
7272 lea r6, [r6 + r1 * 8]
7273 add r2, 8
7274 dec byte [rsp + 63] ; pass counter lives on the stack
7275 jnz .loop
7276 mov rsp, [rsp+64] ; restore the caller's stack pointer
7277 RET
7278
; MODE_12_24_ROW0 %1
;   First-pass variant of MODE_12_24: besides the bytes at r2 it needs up to
;   four extra bytes that come from the other reference array (r3). Those are
;   gathered once into the 16-byte stack slot `above` (a %define supplied by
;   the caller) and shifted in one at a time as the groups progress.
;   In:   r2 = main reference bytes (16-byte loads at r2)
;         r3 = second reference array; bytes r3 + 6, + 13, + 19, + 26 are used
;         r4 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r4 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_12 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_12 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 and the `above` slot.
7279 %macro MODE_12_24_ROW0 1
; above[15], [14], [13], [12] = bytes r3 + 6, r3 + 13, r3 + 19, r3 + 26
7280 movu m0, [r3 + 6]
7281 pshufb m0, [c_mode32_12_0]
7282 pinsrb m0, [r3 + 26], 12
7283 mova above, m0
7284 movu m2, [r2]
7285 palignr m1, m2, 1
7286 punpcklbw m2, m1
7287 pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
7288 pmulhrsw m4, m7
7289 pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
7290 pmulhrsw m3, m7
7291 packuswb m4, m3
7292 pmaddubsw m5, m2, [r4 + 16] ; [17]
7293 pmulhrsw m5, m7
7294 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7295 pmulhrsw m6, m7
7296 packuswb m5, m6
7297 pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
7298 pmulhrsw m6, m7
7299 pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
7300 pmulhrsw m3, m7
7301 packuswb m6, m3
; from here on the pairs start one byte earlier: 'a' is above[15]
7302 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7303 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
7304 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
7305 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
7306 pmulhrsw m1, m7
7307 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7308 pmulhrsw m3, m7
7309 packuswb m1, m3
7310 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7311 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
7312 pmulhrsw m4, m7
7313 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
7314 pmulhrsw m5, m7
7315 packuswb m4, m5
7316 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
7317 pmulhrsw m5, m7
7318 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7319 pmulhrsw m6, m7
7320 packuswb m5, m6
; shift in one more pair from `above`: 'b' is above[14]
7321 palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
7322 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
7323 pmulhrsw m6, m7
7324 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
7325 pmulhrsw m1, m7
7326 packuswb m6, m1
7327 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
7328 pmulhrsw m1, m7
7329 pmaddubsw m3, m2, [r4] ; [16]
7330 pmulhrsw m3, m7
7331 packuswb m1, m3
7332 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7333 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
7334 pmulhrsw m4, m7
7335 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
7336 pmulhrsw m3, m7
7337 packuswb m4, m3
7338 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
7339 pmulhrsw m5, m7
; shift in the pair (above[13], above[14]) in front of the current pairs
7340 pslldq m1, above, 1
7341 palignr m2, m1, 14
7342 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7343 pmulhrsw m6, m7
7344 packuswb m5, m6
7345 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
7346 pmulhrsw m6, m7
7347 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
7348 pmulhrsw m3, m7
7349 packuswb m6, m3
7350 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
7351 pmulhrsw m1, m7
7352 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
7353 pmulhrsw m3, m7
7354 packuswb m1, m3
7355 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7356 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
7357 pmulhrsw m4, m7
; shift in the pair (above[12], above[13]) in front of the current pairs
7358 pslldq m1, above, 2
7359 palignr m2, m1, 14
7360 pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
7361 pmulhrsw m5, m7
7362 packuswb m4, m5
7363 pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
7364 pmulhrsw m5, m7
7365 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7366 pmulhrsw m6, m7
7367 packuswb m5, m6
7368 pmaddubsw m6, m2, [r4 - 16] ; [15]
7369 pmulhrsw m6, m7
7370 pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
7371 pmulhrsw m1, m7
7372 packuswb m6, m1
7373 pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
7374 pmulhrsw m1, m7
; last result is a raw copy: pb_fact0 picks the first byte of each pair
7375 movu m0, [pb_fact0]
7376 pshufb m2, m0
7377 pmovzxbw m2, m2
7378 packuswb m1, m2
7379 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7380 %endmacro
7381
; MODE_12_24 %1
;   Produces 32 interpolated 8-byte results in four groups of eight and passes
;   each group, packed two results per register in m4, m5, m6, m1, to
;   TRANSPOSE_STORE_8x8 <group 0..3>, %1, m4, m5, m6, m1.
;   Unlike MODE_12_24_ROW0 every byte it needs is read relative to r2, at
;   non-positive offsets as well: 16-byte loads at r2, r2 - 2, r2 - 3, r2 - 4.
;   In:   r2 = reference bytes (must be readable from r2 - 4)
;         r4 = base of 16-byte coefficient rows; the bracketed number in each
;              trailing comment is the row index when r4 = ang_table + 16 * 16,
;              which is how intra_pred_ang32_12 sets it up
;         m7 = pmulhrsw multiplier (intra_pred_ang32_12 loads pw_1024)
;         %1 = forwarded unchanged as the 2nd TRANSPOSE_STORE_8x8 argument
;   Writes m0-m6 directly.
7382 %macro MODE_12_24 1
; m0 = byte pairs starting at r2 + 1, m2 = byte pairs starting at r2
7383 movu m2, [r2]
7384 palignr m1, m2, 1
7385 punpckhbw m0, m2, m1
7386 punpcklbw m2, m1
7387 palignr m0, m2, 2
7388 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
7389 pmulhrsw m4, m7
7390 pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
7391 pmulhrsw m3, m7
7392 packuswb m4, m3
7393 pmaddubsw m5, m0, [r4 + 16] ; [17]
7394 pmulhrsw m5, m7
7395 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
7396 pmulhrsw m6, m7
7397 packuswb m5, m6
7398 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
7399 pmulhrsw m6, m7
7400 pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
7401 pmulhrsw m3, m7
7402 packuswb m6, m3
7403 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
7404 pmulhrsw m1, m7
7405 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7406 pmulhrsw m3, m7
7407 packuswb m1, m3
7408 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7409 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
7410 pmulhrsw m4, m7
7411 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
7412 pmulhrsw m5, m7
7413 packuswb m4, m5
7414 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
7415 pmulhrsw m5, m7
7416 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7417 pmulhrsw m6, m7
7418 packuswb m5, m6
; m2 = byte pairs starting at r2 - 1
7419 movu m0, [r2 - 2]
7420 palignr m1, m0, 1
7421 punpckhbw m2, m0, m1
7422 punpcklbw m0, m1
7423 palignr m2, m0, 2
7424 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
7425 pmulhrsw m6, m7
7426 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
7427 pmulhrsw m1, m7
7428 packuswb m6, m1
7429 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
7430 pmulhrsw m1, m7
7431 pmaddubsw m3, m2, [r4] ; [16]
7432 pmulhrsw m3, m7
7433 packuswb m1, m3
7434 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7435 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
7436 pmulhrsw m4, m7
7437 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
7438 pmulhrsw m3, m7
7439 packuswb m4, m3
7440 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
7441 pmulhrsw m5, m7
; m2 = byte pairs starting at r2 - 2
7442 movu m0, [r2 - 3]
7443 palignr m1, m0, 1
7444 punpckhbw m2, m0, m1
7445 punpcklbw m0, m1
7446 palignr m2, m0, 2
7447 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7448 pmulhrsw m6, m7
7449 packuswb m5, m6
7450 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
7451 pmulhrsw m6, m7
7452 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
7453 pmulhrsw m3, m7
7454 packuswb m6, m3
7455 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
7456 pmulhrsw m1, m7
7457 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
7458 pmulhrsw m3, m7
7459 packuswb m1, m3
7460 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7461 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
7462 pmulhrsw m4, m7
; m0 = byte pairs starting at r2 - 3
7463 movu m2, [r2 - 4]
7464 palignr m1, m2, 1
7465 punpckhbw m0, m2, m1
7466 punpcklbw m2, m1
7467 palignr m0, m2, 2
7468 pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
7469 pmulhrsw m5, m7
7470 packuswb m4, m5
7471 pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
7472 pmulhrsw m5, m7
7473 pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
7474 pmulhrsw m6, m7
7475 packuswb m5, m6
7476 pmaddubsw m6, m0, [r4 - 16] ; [15]
7477 pmulhrsw m6, m7
7478 pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
7479 pmulhrsw m1, m7
7480 packuswb m6, m1
7481 pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
7482 pmulhrsw m1, m7
; last result is a raw copy: pb_fact0 picks the first byte of each pair
7483 movu m2, [pb_fact0]
7484 pshufb m0, m2
7485 pmovzxbw m0, m0
7486 packuswb m1, m0
7487 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7488 %endmacro
7489 ;-----------------------------------------------------------------------------------------------------------------
7490 ; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7491 ;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
    %define above [rsp + 0 * mmsize]            ; 16-byte scratch slot used by the MODE_12_24* macros

    ; Register roles for the strip macros:
    ;   r4 = ang_table + 16 * 16 (coefficient rows are addressed as r4 +/- k * 16)
    ;   r5 = 3 * stride, r6 = dst + 4 * stride, m7 = pw_1024 (pmulhrsw multiplier)
    mova        m7, [pw_1024]
    lea         r6, [r0 + r1 * 4]
    lea         r5, [r1 * 3]
    lea         r4, [ang_table + 16 * 16]

    ; First strip (special-cased by its own macro).
    MODE_12_24_ROW0 1

    ; Step to the next strip: r0 += 8 * stride, r6 = r0 + 4 * stride, r2 += 7.
    ; r3 is recycled as the counter for the three remaining strips.
    mov         r3, 3
    add         r2, 7
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r0 + r1 * 4]

.next_strip:
    MODE_12_24 1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r0 + r1 * 4]
    dec         r3
    jnz         .next_strip
    RET
7514
; MODE_13_23_ROW0 %1
; First strip of intra_pred_ang32_13: computes 32 interpolated 8-byte vectors and passes
; them, eight at a time, to TRANSPOSE_STORE_8x8 (slots 0..3). %1 is forwarded unchanged
; as the second argument of TRANSPOSE_STORE_8x8 (defined elsewhere).
; Inputs : r2, r3 = third / fourth function arguments (refLeft / refAbove in the prototype
;          comment), r4 = coefficient table base (caller passes ang_table + 16 * 16),
;          m7 = pmulhrsw multiplier (caller loads pw_1024).
; Outputs: `above` stack slot is written; m0-m6 are clobbered.
; The "[n]" comments name the coefficient row used: n = 16 + (offset from r4) / 16.
7515 %macro MODE_13_23_ROW0 1
; --- gather 8 bytes from r3 into the high half of `above`
7516 movu m0, [r3 + 1]
7517 movu m1, [r3 + 15]
7518 pshufb m0, [c_mode32_13_0] ; low dword = bytes 3, 6, 10, 13 of the load
7519 pshufb m1, [c_mode32_13_0]
7520 punpckldq m0, m1 ; low 8 bytes = the 8 gathered samples
7521 pshufb m0, [c_mode32_13_shuf] ; bytes 8..15 = gathered samples in reverse order
7522 mova above, m0
; --- vectors 0..7 -> slot 0
7523 movu m2, [r2]
7524 palignr m1, m2, 1 ; 2-operand form: low 15 bytes = m2 >> 1 byte, top byte is stale
7525 punpcklbw m2, m1 ; adjacent-sample pairs from the low 8 bytes
7526 pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
7527 pmulhrsw m4, m7
7528 pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
7529 pmulhrsw m3, m7
7530 packuswb m4, m3
7531 pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
7532 pmulhrsw m5, m7
7533 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7534 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
7535 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
7536 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7537 pmulhrsw m6, m7
7538 packuswb m5, m6
7539 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
7540 pmulhrsw m6, m7
7541 pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
7542 pmulhrsw m0, m7
7543 packuswb m6, m0
7544 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
7545 pmulhrsw m1, m7
7546 palignr m2, above, 14 ; slide the pair window: new lowest pair = above bytes 14, 15
7547 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7548 pmulhrsw m3, m7
7549 packuswb m1, m3
7550 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
; --- vectors 8..15 -> slot 1
7551 pmaddubsw m4, m2, [r4 - 16] ; [15]
7552 pmulhrsw m4, m7
7553 pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
7554 pmulhrsw m5, m7
7555 packuswb m4, m5
7556 pslldq m0, above, 1 ; m0 = above << 1 byte, so its top two bytes are above bytes 13, 14
7557 palignr m2, m0, 14 ; new lowest pair = above bytes 13, 14
7558 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
7559 pmulhrsw m5, m7
7560 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7561 pmulhrsw m6, m7
7562 packuswb m5, m6
7563 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
7564 pmulhrsw m6, m7
7565 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
7566 pmulhrsw m1, m7
7567 packuswb m6, m1
7568 pslldq m0, 1 ; m0 = above << 2 bytes
7569 palignr m2, m0, 14
7570 pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
7571 pmulhrsw m1, m7
7572 pmaddubsw m0, m2, [r4] ; [16]
7573 pmulhrsw m0, m7
7574 packuswb m1, m0
7575 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
; --- vectors 16..23 -> slot 2
7576 pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
7577 pmulhrsw m4, m7
7578 pslldq m0, above, 3 ; m0 was used as a temporary above, so rebuild the shifted copy from `above`
7579 palignr m2, m0, 14
7580 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
7581 pmulhrsw m3, m7
7582 packuswb m4, m3
7583 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
7584 pmulhrsw m5, m7
7585 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7586 pmulhrsw m6, m7
7587 packuswb m5, m6
7588 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
7589 pmulhrsw m6, m7
7590 pslldq m0, 1 ; m0 = above << 4 bytes
7591 palignr m2, m0, 14
7592 pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
7593 pmulhrsw m0, m7
7594 packuswb m6, m0
7595 pmaddubsw m1, m2, [r4 + 16] ; [17]
7596 pmulhrsw m1, m7
7597 pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
7598 pmulhrsw m0, m7
7599 packuswb m1, m0
7600 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
; --- vectors 24..31 -> slot 3
7601 pslldq m0, above, 5 ; rebuild the shifted copy again (m0 was clobbered)
7602 palignr m2, m0, 14
7603 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
7604 pmulhrsw m4, m7
7605 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
7606 pmulhrsw m5, m7
7607 packuswb m4, m5
7608 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
7609 pmulhrsw m5, m7
7610 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7611 pmulhrsw m6, m7
7612 packuswb m5, m6
7613 pslldq m0, 1 ; m0 = above << 6 bytes
7614 palignr m2, m0, 14
7615 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
7616 pmulhrsw m6, m7
7617 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
7618 pmulhrsw m1, m7
7619 packuswb m6, m1
7620 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
7621 pmulhrsw m1, m7
7622 pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
7623 pmulhrsw m3, m7
7624 packuswb m1, m3
7625 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7626 %endmacro
7627
; MODE_13_23 %1
; Strip body used by intra_pred_ang32_13 for every strip after the first: computes 32
; 8-byte vectors from the bytes around r2 and passes them, eight at a time, to
; TRANSPOSE_STORE_8x8 (slots 0..3). %1 is forwarded unchanged as that macro's second argument.
; Inputs : r2 = reference pointer (loads reach from r2 - 7 to r2 + 15),
;          r4 = coefficient table base, m7 = pmulhrsw multiplier.
; Clobbers m0-m6. Unlike MODE_13_23_ROW0 it does not touch the `above` stack slot.
; The "[n]" comments name the coefficient row used: n = 16 + (offset from r4) / 16.
7628 %macro MODE_13_23 1
; --- vectors 0..7 -> slot 0
7629 movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7630 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7631 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7632 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7633 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
; m0 = pair window starting one sample later than m2
7634 pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
7635 pmulhrsw m4, m7
7636 pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
7637 pmulhrsw m3, m7
7638 packuswb m4, m3
7639 pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
7640 pmulhrsw m5, m7
7641 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7642 pmulhrsw m6, m7
7643 packuswb m5, m6
7644 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
7645 pmulhrsw m6, m7
7646 pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
7647 pmulhrsw m3, m7
7648 packuswb m6, m3
7649 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
7650 pmulhrsw m1, m7
7651 movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
7652 palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7653 punpckhbw m0, m2, m3
7654 punpcklbw m2, m3
7655 palignr m0, m2, 2
7656 pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
7657 pmulhrsw m3, m7
7658 packuswb m1, m3
7659 mova m3, m0 ; keep this pair window for the first vectors of the next slot
7660 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
; --- vectors 8..15 -> slot 1
7661 pmaddubsw m4, m3, [r4 - 16] ; [15]
7662 pmulhrsw m4, m7
7663 pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
7664 pmulhrsw m5, m7
7665 packuswb m4, m5
7666 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
7667 pmulhrsw m5, m7
7668 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7669 pmulhrsw m6, m7
7670 packuswb m5, m6
7671 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
7672 pmulhrsw m6, m7
7673 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
7674 pmulhrsw m1, m7
7675 packuswb m6, m1
7676 movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7677 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7678 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7679 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7680 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7681 pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
7682 pmulhrsw m1, m7
7683 pmaddubsw m3, m0, [r4] ; [16]
7684 pmulhrsw m3, m7
7685 packuswb m1, m3
7686 mova m3, m0 ; keep this pair window for the first vector of the next slot
7687 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
; --- vectors 16..23 -> slot 2
7688 pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
7689 pmulhrsw m4, m7
7690 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
7691 pmulhrsw m3, m7
7692 packuswb m4, m3
7693 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
7694 pmulhrsw m5, m7
7695 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7696 pmulhrsw m6, m7
7697 packuswb m5, m6
7698 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
7699 pmulhrsw m6, m7
7700 movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7701 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7702 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7703 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7704 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7705 pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
7706 pmulhrsw m3, m7
7707 packuswb m6, m3
7708 pmaddubsw m1, m0, [r4 + 16] ; [17]
7709 pmulhrsw m1, m7
7710 pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
7711 pmulhrsw m3, m7
7712 packuswb m1, m3
7713 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
; --- vectors 24..31 -> slot 3
7714 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
7715 pmulhrsw m4, m7
7716 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
7717 pmulhrsw m5, m7
7718 packuswb m4, m5
7719 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
7720 pmulhrsw m5, m7
7721 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7722 pmulhrsw m6, m7
7723 packuswb m5, m6
7724 movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7725 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7726 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7727 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
7728 pmulhrsw m6, m7
7729 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
7730 pmulhrsw m1, m7
7731 packuswb m6, m1
7732 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
7733 pmulhrsw m1, m7
; last vector: no multiply; pb_fact0 selects the even bytes, i.e. the first sample of each pair
7734 movu m0, [pb_fact0]
7735 pshufb m2, m0
7736 pmovzxbw m2, m2 ; widen to words so packuswb can merge it with m1
7737 packuswb m1, m2
7738 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7739 %endmacro
7740 ;-----------------------------------------------------------------------------------------------------------------
7741 ; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7742 ;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
    %define above [rsp + 0 * mmsize]            ; 16-byte scratch slot written by MODE_13_23_ROW0

    ; Register roles for the strip macros:
    ;   r4 = ang_table + 16 * 16 (coefficient rows are addressed as r4 +/- k * 16)
    ;   r5 = 3 * stride, r6 = dst + 4 * stride, m7 = pw_1024 (pmulhrsw multiplier)
    mova        m7, [pw_1024]
    lea         r6, [r0 + r1 * 4]
    lea         r5, [r1 * 3]
    lea         r4, [ang_table + 16 * 16]

    ; First strip: also reads r3 and fills `above`.
    MODE_13_23_ROW0 1

    ; Step to the next strip: r0 += 8 * stride, r6 = r0 + 4 * stride, r2 += 7.
    ; r3 is no longer needed as a pointer, so it becomes the counter for the
    ; three remaining strips.
    mov         r3, 3
    add         r2, 7
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r0 + r1 * 4]

.next_strip:
    MODE_13_23 1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r0 + r1 * 4]
    dec         r3
    jnz         .next_strip
    RET
7764
7765 ;-------------------------------------------------------------------------------------------------------------------
7766 ; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7767 ;-------------------------------------------------------------------------------------------------------------------
; Builds one contiguous reference array on the stack, then runs .loop four times; each
; pass issues four PROC32_8x8 invocations (macro defined elsewhere in this file).
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 12]  13 bytes gathered from [r3] / [r3 + 15] via c_mode32_14_0
;   [rsp + 13 .. rsp + 44]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
7768 INIT_XMM sse4
7769 cglobal intra_pred_ang32_14, 4,7,8
7770 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7771 mov r6, rsp
7772 sub rsp, 64+gprsize
7773 and rsp, ~63
7774 mov [rsp+64], r6
7775
7776 ; collect reference pixel
7777 movu m0, [r3]
7778 movu m1, [r3 + 15]
7779 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
7780 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
7781 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
7782 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
7783 mova [rsp], m0
7784 movu m0, [r2 + 1]
7785 movu m1, [r2 + 1 + 16]
7786 movu [rsp + 13], m0 ; overwrites the three don't-care bytes at rsp + 13..15
7787 movu [rsp + 13 + 16], m1
7788 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
7789
7790 ; filter
7791 lea r2, [rsp + 13] ; r2 -> [0]
7792 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7793 lea r4, [ang_table] ; r4 -> ang_table
7794 lea r5, [r1 * 3] ; r5 -> 3 * stride
7795 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
7796 mova m5, [pw_1024] ; m5 -> 1024
7797 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are overwritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the two loads above look redundant - confirm against
; PROC32_8x8 before removing them.
7798
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
7799 .loop:
7800 ; Row[0 - 7]
7801 movu m7, [r2 - 4]
7802 palignr m0, m7, 3
7803 mova m1, m0
7804 palignr m2, m7, 2
7805 mova m3, m2
7806 palignr m4, m7, 1
7807 mova m5, m4
7808 mova m6, m4
7809 PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24
7810
7811 ; Row[8 - 15]
7812 movu m7, [r2 - 7]
7813 palignr m0, m7, 3
7814 palignr m1, m7, 2
7815 mova m2, m1
7816 mova m3, m1
7817 palignr m4, m7, 1
7818 mova m5, m4
7819 mova m6, m7
7820 PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16
7821
7822 ; Row[16 - 23]
7823 movu m7, [r2 - 10]
7824 palignr m0, m7, 3
7825 palignr m1, m7, 2
7826 mova m2, m1
7827 palignr m3, m7, 1
7828 mova m4, m3
7829 mova m5, m3
7830 mova m6, m7
7831 PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8
7832
7833 ; Row[24 - 31]
7834 movu m7, [r2 - 13]
7835 palignr m0, m7, 2
7836 mova m1, m0
7837 mova m2, m0
7838 palignr m3, m7, 1
7839 mova m4, m3
7840 mova m5, m7
7841 mova m6, m7
7842 PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0
7843
; next pass: r0 += 8 * stride, r6 stays at r0 + 4 * stride, source window += 8 bytes
7844 lea r0, [r6 + r1 * 4]
7845 lea r6, [r6 + r1 * 8]
7846 add r2, 8
7847 dec byte [rsp + 63]
7848 jnz .loop
7849 mov rsp, [rsp+64] ; undo the manual stack realignment
7850 RET
7851
7852 ;-------------------------------------------------------------------------------------------------------------------
7853 ; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7854 ;-------------------------------------------------------------------------------------------------------------------
; Builds one contiguous reference array on the stack, then runs .loop four times; each
; pass issues four PROC32_8x8 invocations (macro defined elsewhere in this file).
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 16]  bytes gathered from [r3] / [r3 + 15] via c_mode32_15_0
;   [rsp + 17 .. rsp + 48]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
7855 INIT_XMM sse4
7856 cglobal intra_pred_ang32_15, 4,7,8
7857 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7858 mov r6, rsp
7859 sub rsp, 64+gprsize
7860 and rsp, ~63
7861 mov [rsp+64], r6
7862
7863 ; collect reference pixel
7864 movu m0, [r3]
7865 movu m1, [r3 + 15]
7866 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
7867 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
7868 mova [rsp], m1
7869 movu [rsp + 8], m0 ; overlaps the previous store from byte 8 on
7870 movu m0, [r2 + 1]
7871 movu m1, [r2 + 1 + 16]
7872 movu [rsp + 17], m0 ; overwrites the don't-care bytes left by the store at rsp + 8
7873 movu [rsp + 17 + 16], m1
7874 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
7875
7876 ; filter
7877 lea r2, [rsp + 17] ; r2 -> [0]
7878 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7879 lea r4, [ang_table] ; r4 -> ang_table
7880 lea r5, [r1 * 3] ; r5 -> 3 * stride
7881 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
7882 mova m5, [pw_1024] ; m5 -> 1024
7883 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are rewritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the constants loaded above do not survive into the macro -
; confirm against PROC32_8x8 before removing these loads.
7884
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
7885 .loop:
7886 ; Row[0 - 7]
7887 movu m7, [r2 - 5]
7888 palignr m0, m7, 4
7889 palignr m1, m7, 3
7890 mova m2, m1
7891 palignr m3, m7, 2
7892 mova m4, m3
7893 palignr m5, m7, 1
7894 mova m6, m5
7895 PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24
7896
7897 ; Row[8 - 15]
7898 movu m7, [r2 - 9]
7899 palignr m0, m7, 4
7900 palignr m1, m7, 3
7901 mova m2, m1
7902 palignr m3, m7, 2
7903 mova m4, m3
7904 palignr m5, m7, 1
7905 mova m6, m5
7906 PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16
7907
7908 ; Row[16 - 23]
7909 movu m7, [r2 - 13]
7910 palignr m0, m7, 3
7911 mova m1, m0
7912 palignr m2, m7, 2
7913 mova m3, m2
7914 palignr m4, m7, 1
7915 mova m5, m4
7916 mova m6, m7
7917 PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8
7918
7919 ; Row[24 - 31]
7920 movu m7, [r2 - 17]
7921 palignr m0, m7, 3
7922 mova m1, m0
7923 palignr m2, m7, 2
7924 mova m3, m2
7925 palignr m4, m7, 1
7926 mova m5, m4
7927 mova m6, m7
7928 PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0
7929
; next pass: r0 += 8 * stride, r6 stays at r0 + 4 * stride, source window += 8 bytes
7930 lea r0, [r6 + r1 * 4]
7931 lea r6, [r6 + r1 * 8]
7932 add r2, 8
7933 dec byte [rsp + 63]
7934 jnz .loop
7935 mov rsp, [rsp+64] ; undo the manual stack realignment
7936 RET
7937
7938 ;-------------------------------------------------------------------------------------------------------------------
7939 ; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7940 ;-------------------------------------------------------------------------------------------------------------------
; Builds one contiguous reference array on the stack, then runs .loop four times; each
; pass issues four PROC32_8x8 invocations (macro defined elsewhere in this file).
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 20]  bytes gathered from [r3] / [r3 + 15] via c_mode32_16_0
;   [rsp + 21 .. rsp + 52]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
7941 INIT_XMM sse4
7942 cglobal intra_pred_ang32_16, 4,7,8
7943 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
7944 mov r6, rsp
7945 sub rsp, 64+gprsize
7946 and rsp, ~63
7947 mov [rsp+64], r6
7948
7949 ; collect reference pixel
7950 movu m0, [r3]
7951 movu m1, [r3 + 15]
7952 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
7953 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
7954 mova [rsp], m1
7955 movu [rsp + 10], m0 ; overlaps the previous store from byte 10 on
7956 movu m0, [r2 + 1]
7957 movu m1, [r2 + 1 + 16]
7958 movu [rsp + 21], m0 ; overwrites the don't-care bytes left by the store at rsp + 10
7959 movu [rsp + 21 + 16], m1
7960 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
7961
7962 ; filter
7963 lea r2, [rsp + 21] ; r2 -> [0]
7964 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7965 lea r4, [ang_table] ; r4 -> ang_table
7966 lea r5, [r1 * 3] ; r5 -> 3 * stride
7967 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
7968 mova m5, [pw_1024] ; m5 -> 1024
7969 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are rewritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the constants loaded above do not survive into the macro -
; confirm against PROC32_8x8 before removing these loads.
7970
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
7971 .loop:
7972 ; Row[0 - 7]
7973 movu m7, [r2 - 6]
7974 palignr m0, m7, 5
7975 palignr m1, m7, 4
7976 mova m2, m1
7977 palignr m3, m7, 3
7978 palignr m4, m7, 2
7979 mova m5, m4
7980 palignr m6, m7, 1
7981 PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24
7982
7983 ; Row[8 - 15]
7984 movu m7, [r2 - 11]
7985 palignr m0, m7, 5
7986 palignr m1, m7, 4
7987 palignr m2, m7, 3
7988 mova m3, m2
7989 palignr m4, m7, 2
7990 palignr m5, m7, 1
7991 mova m6, m5
7992 PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16
7993
7994 ; Row[16 - 23]
7995 movu m7, [r2 - 16]
7996 palignr m0, m7, 4
7997 mova m1, m0
7998 palignr m2, m7, 3
7999 palignr m3, m7, 2
8000 mova m4, m3
8001 palignr m5, m7, 1
8002 mova m6, m7
8003 PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8
8004
8005 ; Row[24 - 31]
8006 movu m7, [r2 - 21]
8007 palignr m0, m7, 4
8008 palignr m1, m7, 3
8009 mova m2, m1
8010 palignr m3, m7, 2
8011 palignr m4, m7, 1
8012 mova m5, m4
8013 mova m6, m7
8014 PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0
8015
; next pass: r0 += 8 * stride, r6 stays at r0 + 4 * stride, source window += 8 bytes
8016 lea r0, [r6 + r1 * 4]
8017 lea r6, [r6 + r1 * 8]
8018 add r2, 8
8019 dec byte [rsp + 63]
8020 jnz .loop
8021 mov rsp, [rsp+64] ; undo the manual stack realignment
8022 RET
8023
8024 ;------------------------------------------------------------------------------------------------------------------
8025 ; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8026 ;------------------------------------------------------------------------------------------------------------------
; Builds one contiguous reference array on the stack, then runs .loop four times; each
; pass issues four PROC32_8x8 invocations (macro defined elsewhere in this file).
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 25]  bytes gathered from [r3] / [r3 + 16] via c_mode32_17_0
;   [rsp + 26 .. rsp + 57]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
8027 INIT_XMM sse4
8028 cglobal intra_pred_ang32_17, 4,7,8
8029 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8030 mov r6, rsp
8031 sub rsp, 64+gprsize
8032 and rsp, ~63
8033 mov [rsp+64], r6
8034
8035 ; collect reference pixel
8036 movu m0, [r3]
8037 movu m1, [r3 + 16]
8038 pshufb m0, [c_mode32_17_0]
8039 pshufb m1, [c_mode32_17_0]
8040 mova [rsp ], m1
8041 movu [rsp + 13], m0 ; overlaps the previous store from byte 13 on
8042 movu m0, [r2 + 1]
8043 movu m1, [r2 + 1 + 16]
8044 movu [rsp + 26], m0 ; overlaps the store at rsp + 13 from byte 26 on
8045 movu [rsp + 26 + 16], m1
8046 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
8047
8048 ; filter
8049 lea r2, [rsp + 25] ; r2 -> [0]
8050 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8051 lea r4, [ang_table] ; r4 -> ang_table
8052 lea r5, [r1 * 3] ; r5 -> 3 * stride
8053 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
8054 mova m5, [pw_1024] ; m5 -> 1024
8055 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are rewritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the constants loaded above do not survive into the macro -
; confirm against PROC32_8x8 before removing these loads.
8056
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
8057 .loop:
8058 ; Row[0 - 7]
8059 movu m7, [r2 - 6]
8060 palignr m0, m7, 6
8061 palignr m1, m7, 5
8062 palignr m2, m7, 4
8063 palignr m3, m7, 3
8064 palignr m4, m7, 2
8065 mova m5, m4
8066 palignr m6, m7, 1
8067 PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16
8068
8069 ; Row[8 - 15]
8070 movu m7, [r2 - 12]
8071 palignr m0, m7, 5
8072 palignr m1, m7, 4
8073 mova m2, m1
8074 palignr m3, m7, 3
8075 palignr m4, m7, 2
8076 palignr m5, m7, 1
8077 mova m6, m7
8078 PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0
8079
8080 ; Row[16 - 23]
8081 movu m7, [r2 - 19]
8082 palignr m0, m7, 6
8083 palignr m1, m7, 5
8084 palignr m2, m7, 4
8085 palignr m3, m7, 3
8086 palignr m4, m7, 2
8087 mova m5, m4
8088 palignr m6, m7, 1
8089 PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16
8090
8091 ; Row[24 - 31]
8092 movu m7, [r2 - 25]
8093 palignr m0, m7, 5
8094 palignr m1, m7, 4
8095 mova m2, m1
8096 palignr m3, m7, 3
8097 palignr m4, m7, 2
8098 palignr m5, m7, 1
8099 mova m6, m7
8100 PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0
8101
; next pass: r0 += 8 * stride, r6 stays at r0 + 4 * stride, source window += 8 bytes
8102 lea r0, [r6 + r1 * 4]
8103 lea r6, [r6 + r1 * 8]
8104 add r2, 8
8105 dec byte [rsp + 63]
8106 jnz .loop
8107 mov rsp, [rsp+64] ; undo the manual stack realignment
8108
8109 RET
8110
8111 ;-------------------------------------------------------------------------------------------------------------------
8112 ; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8113 ;-------------------------------------------------------------------------------------------------------------------
; Store one 32-byte line at [r0 + %1].
; %2:%3:%4 are the high, middle and low 16-byte registers of a 48-byte window;
; the line is that window shifted right by %5 bytes. Clobbers m4.
%macro ANG32_18_LINE 5 ; line offset, high, middle, low, shift
    palignr     m4, %3, %4, %5
    movu        [r0 + %1], m4
    palignr     m4, %2, %3, %5
    movu        [r0 + %1 + 16], m4
%endmacro

; Store four consecutive lines at r0, r0 + r1, r0 + r2 and r0 + r3, using the
; shifts %4, %4 - 1, %4 - 2 and %4 - 3 on the window %1:%2:%3.
%macro ANG32_18_QUAD 4 ; high, middle, low, shift of the first line
    ANG32_18_LINE 0,  %1, %2, %3, %4
    ANG32_18_LINE r1, %1, %2, %3, %4 - 1
    ANG32_18_LINE r2, %1, %2, %3, %4 - 2
    ANG32_18_LINE r3, %1, %2, %3, %4 - 3
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    ; Fetch every reference byte first; r2 and r3 are recycled as stride multiples below.
    movu        m0, [r3]                    ; 16 bytes at [r3]
    movu        m1, [r3 + 16]               ; next 16 bytes
    movu        m2, [r2 + 1]                ; 16 bytes at [r2 + 1] ...
    movu        m3, [r2 + 17]               ; ... and the 16 after them
    pshufb      m2, [c_mode32_18_0]         ; reorder with c_mode32_18_0 (table defined elsewhere)
    pshufb      m3, [c_mode32_18_0]

    lea         r2, [r1 * 2]                ; r2 = 2 * stride
    lea         r3, [r1 * 3]                ; r3 = 3 * stride
    lea         r4, [r1 * 4]                ; r4 = 4 * stride

    ; Lines 0-15: window m1:m0:m2. Line 0 is the unshifted m1:m0 pair; every further
    ; line shifts one more byte of m2 in (shift 15 down to 1).
    movu        [r0], m0
    movu        [r0 + 16], m1
    ANG32_18_LINE r1, m1, m0, m2, 15
    ANG32_18_LINE r2, m1, m0, m2, 14
    ANG32_18_LINE r3, m1, m0, m2, 13
    lea         r0, [r0 + r4]
    ANG32_18_QUAD m1, m0, m2, 12
    lea         r0, [r0 + r4]
    ANG32_18_QUAD m1, m0, m2, 8
    lea         r0, [r0 + r4]
    ANG32_18_QUAD m1, m0, m2, 4
    lea         r0, [r0 + r4]

    ; Lines 16-31: same scheme one register further down, window m0:m2:m3.
    movu        [r0], m2
    movu        [r0 + 16], m0
    ANG32_18_LINE r1, m0, m2, m3, 15
    ANG32_18_LINE r2, m0, m2, m3, 14
    ANG32_18_LINE r3, m0, m2, m3, 13
    lea         r0, [r0 + r4]
    ANG32_18_QUAD m0, m2, m3, 12
    lea         r0, [r0 + r4]
    ANG32_18_QUAD m0, m2, m3, 8
    lea         r0, [r0 + r4]
    ANG32_18_QUAD m0, m2, m3, 4
    RET
8275
8276 ;------------------------------------------------------------------------------------------------------------------
8277 ; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8278 ;------------------------------------------------------------------------------------------------------------------
; Same reference gathering and row setup as intra_pred_ang32_17, but r2 and r3 are
; exchanged first, PROC32_8x8 is invoked with 0 as its second argument, and each loop
; pass moves the destination 8 bytes to the right instead of 8 lines down.
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 25]  bytes gathered from [r3] / [r3 + 16] via c_mode32_17_0
;   [rsp + 26 .. rsp + 57]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
8279 INIT_XMM sse4
8280 cglobal intra_pred_ang32_19, 4,7,8
8281 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8282 xchg r2, r3 ; swap the third and fourth arguments (refLeft / refAbove in the prototype comment)
8283 mov r6, rsp
8284 sub rsp, 64+gprsize
8285 and rsp, ~63
8286 mov [rsp+64], r6
8287
8288 ; collect reference pixel
8289 movu m0, [r3]
8290 movu m1, [r3 + 16]
8291 pshufb m0, [c_mode32_17_0]
8292 pshufb m1, [c_mode32_17_0]
8293 mova [rsp ], m1
8294 movu [rsp + 13], m0 ; overlaps the previous store from byte 13 on
8295 movu m0, [r2 + 1]
8296 movu m1, [r2 + 1 + 16]
8297 movu [rsp + 26], m0 ; overlaps the store at rsp + 13 from byte 26 on
8298 movu [rsp + 26 + 16], m1
8299 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
8300
8301 ; filter
8302 lea r2, [rsp + 25] ; r2 -> [0]
8303 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8304 lea r4, [ang_table] ; r4 -> ang_table
8305 lea r5, [r1 * 3] ; r5 -> 3 * stride
8306 lea r6, [r0] ; r6 -> r0
8307 mova m5, [pw_1024] ; m5 -> 1024
8308 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are rewritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the constants loaded above do not survive into the macro -
; confirm against PROC32_8x8 before removing these loads.
8309
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
8310 .loop:
8311 ; Row[0 - 7]
8312 movu m7, [r2 - 6]
8313 palignr m0, m7, 6
8314 palignr m1, m7, 5
8315 palignr m2, m7, 4
8316 palignr m3, m7, 3
8317 palignr m4, m7, 2
8318 mova m5, m4
8319 palignr m6, m7, 1
8320 PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16
8321
8322 ; Row[8 - 15]
8323 movu m7, [r2 - 12]
8324 palignr m0, m7, 5
8325 palignr m1, m7, 4
8326 mova m2, m1
8327 palignr m3, m7, 3
8328 palignr m4, m7, 2
8329 palignr m5, m7, 1
8330 mova m6, m7
8331 lea r0, [r0 + r1 * 4] ; dst += 4 lines here (any further advance would be inside PROC32_8x8 - not visible)
8332 PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0
8333
8334 ; Row[16 - 23]
8335 movu m7, [r2 - 19]
8336 palignr m0, m7, 6
8337 palignr m1, m7, 5
8338 palignr m2, m7, 4
8339 palignr m3, m7, 3
8340 palignr m4, m7, 2
8341 mova m5, m4
8342 palignr m6, m7, 1
8343 lea r0, [r0 + r1 * 4]
8344 PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16
8345
8346 ; Row[24 - 31]
8347 movu m7, [r2 - 25]
8348 palignr m0, m7, 5
8349 palignr m1, m7, 4
8350 mova m2, m1
8351 palignr m3, m7, 3
8352 palignr m4, m7, 2
8353 palignr m5, m7, 1
8354 mova m6, m7
8355 lea r0, [r0 + r1 * 4]
8356 PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0
8357
; next pass: restart at the top line, 8 bytes further right; source window += 8 bytes
8358 add r6, 8
8359 mov r0, r6
8360 add r2, 8
8361 dec byte [rsp + 63]
8362 jnz .loop
8363 mov rsp, [rsp+64] ; undo the manual stack realignment
8364 RET
8365
8366 ;-------------------------------------------------------------------------------------------------------------------
8367 ; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8368 ;-------------------------------------------------------------------------------------------------------------------
; Same reference gathering and row setup as intra_pred_ang32_16, but r2 and r3 are
; exchanged first, PROC32_8x8 is invoked with 0 as its second argument, and each loop
; pass moves the destination 8 bytes to the right instead of 8 lines down.
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 20]  bytes gathered from [r3] / [r3 + 15] via c_mode32_16_0
;   [rsp + 21 .. rsp + 52]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
8369 INIT_XMM sse4
8370 cglobal intra_pred_ang32_20, 4,7,8
8371 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8372 xchg r2, r3 ; swap the third and fourth arguments (refLeft / refAbove in the prototype comment)
8373 mov r6, rsp
8374 sub rsp, 64+gprsize
8375 and rsp, ~63
8376 mov [rsp+64], r6
8377
8378 ; collect reference pixel
8379 movu m0, [r3]
8380 movu m1, [r3 + 15]
8381 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
8382 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
8383 mova [rsp], m1
8384 movu [rsp + 10], m0 ; overlaps the previous store from byte 10 on
8385 movu m0, [r2 + 1]
8386 movu m1, [r2 + 1 + 16]
8387 movu [rsp + 21], m0 ; overwrites the don't-care bytes left by the store at rsp + 10
8388 movu [rsp + 21 + 16], m1
8389 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
8390
8391 ; filter
8392 lea r2, [rsp + 21] ; r2 -> [0]
8393 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8394 lea r4, [ang_table] ; r4 -> ang_table
8395 lea r5, [r1 * 3] ; r5 -> 3 * stride
8396 lea r6, [r0] ; r6 -> r0
8397 mova m5, [pw_1024] ; m5 -> 1024
8398 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are rewritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the constants loaded above do not survive into the macro -
; confirm against PROC32_8x8 before removing these loads.
8399
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
8400 .loop:
8401 ; Row[0 - 7]
8402 movu m7, [r2 - 6]
8403 palignr m0, m7, 5
8404 palignr m1, m7, 4
8405 mova m2, m1
8406 palignr m3, m7, 3
8407 palignr m4, m7, 2
8408 mova m5, m4
8409 palignr m6, m7, 1
8410 PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24
8411
8412 ; Row[8 - 15]
8413 movu m7, [r2 - 11]
8414 palignr m0, m7, 5
8415 palignr m1, m7, 4
8416 palignr m2, m7, 3
8417 mova m3, m2
8418 palignr m4, m7, 2
8419 palignr m5, m7, 1
8420 mova m6, m5
8421 lea r0, [r0 + r1 * 4] ; dst += 4 lines here (any further advance would be inside PROC32_8x8 - not visible)
8422 PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16
8423
8424 ; Row[16 - 23]
8425 movu m7, [r2 - 16]
8426 palignr m0, m7, 4
8427 mova m1, m0
8428 palignr m2, m7, 3
8429 palignr m3, m7, 2
8430 mova m4, m3
8431 palignr m5, m7, 1
8432 mova m6, m7
8433 lea r0, [r0 + r1 * 4]
8434 PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8
8435
8436 ; Row[24 - 31]
8437 movu m7, [r2 - 21]
8438 palignr m0, m7, 4
8439 palignr m1, m7, 3
8440 mova m2, m1
8441 palignr m3, m7, 2
8442 palignr m4, m7, 1
8443 mova m5, m4
8444 mova m6, m7
8445 lea r0, [r0 + r1 * 4]
8446 PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0
8447
; next pass: restart at the top line, 8 bytes further right; source window += 8 bytes
8448 add r6, 8
8449 mov r0, r6
8450 add r2, 8
8451 dec byte [rsp + 63]
8452 jnz .loop
8453 mov rsp, [rsp+64] ; undo the manual stack realignment
8454 RET
8455
8456 ;-------------------------------------------------------------------------------------------------------------------
8457 ; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8458 ;-------------------------------------------------------------------------------------------------------------------
; Same reference gathering and row setup as intra_pred_ang32_15, but r2 and r3 are
; exchanged first, PROC32_8x8 is invoked with 0 as its second argument, and each loop
; pass moves the destination 8 bytes to the right instead of 8 lines down.
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 16]  bytes gathered from [r3] / [r3 + 15] via c_mode32_15_0
;   [rsp + 17 .. rsp + 48]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
8459 INIT_XMM sse4
8460 cglobal intra_pred_ang32_21, 4,7,8
8461 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8462 xchg r2, r3 ; swap the third and fourth arguments (refLeft / refAbove in the prototype comment)
8463 mov r6, rsp
8464 sub rsp, 64+gprsize
8465 and rsp, ~63
8466 mov [rsp+64], r6
8467
8468 ; collect reference pixel
8469 movu m0, [r3]
8470 movu m1, [r3 + 15]
8471 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
8472 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
8473 mova [rsp], m1
8474 movu [rsp + 8], m0 ; overlaps the previous store from byte 8 on
8475 movu m0, [r2 + 1]
8476 movu m1, [r2 + 1 + 16]
8477 movu [rsp + 17], m0 ; overwrites the don't-care bytes left by the store at rsp + 8
8478 movu [rsp + 17 + 16], m1
8479 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
8480
8481 ; filter
8482 lea r2, [rsp + 17] ; r2 -> [0]
8483 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8484 lea r4, [ang_table] ; r4 -> ang_table
8485 lea r5, [r1 * 3] ; r5 -> 3 * stride
8486 lea r6, [r0] ; r6 -> r0
8487 mova m5, [pw_1024] ; m5 -> 1024
8488 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are rewritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the constants loaded above do not survive into the macro -
; confirm against PROC32_8x8 before removing these loads.
8489
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
8490 .loop:
8491 ; Row[0 - 7]
8492 movu m7, [r2 - 5]
8493 palignr m0, m7, 4
8494 palignr m1, m7, 3
8495 mova m2, m1
8496 palignr m3, m7, 2
8497 mova m4, m3
8498 palignr m5, m7, 1
8499 mova m6, m5
8500 PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24
8501
8502 ; Row[8 - 15]
8503 movu m7, [r2 - 9]
8504 palignr m0, m7, 4
8505 palignr m1, m7, 3
8506 mova m2, m1
8507 palignr m3, m7, 2
8508 mova m4, m3
8509 palignr m5, m7, 1
8510 mova m6, m5
8511 lea r0, [r0 + r1 * 4] ; dst += 4 lines here (any further advance would be inside PROC32_8x8 - not visible)
8512 PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16
8513
8514 ; Row[16 - 23]
8515 movu m7, [r2 - 13]
8516 palignr m0, m7, 3
8517 mova m1, m0
8518 palignr m2, m7, 2
8519 mova m3, m2
8520 palignr m4, m7, 1
8521 mova m5, m4
8522 mova m6, m7
8523 lea r0, [r0 + r1 * 4]
8524 PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8
8525
8526 ; Row[24 - 31]
8527 movu m7, [r2 - 17]
8528 palignr m0, m7, 3
8529 mova m1, m0
8530 palignr m2, m7, 2
8531 mova m3, m2
8532 palignr m4, m7, 1
8533 mova m5, m4
8534 mova m6, m7
8535 lea r0, [r0 + r1 * 4]
8536 PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0
8537
; next pass: restart at the top line, 8 bytes further right; source window += 8 bytes
8538 add r6, 8
8539 mov r0, r6
8540 add r2, 8
8541 dec byte [rsp + 63]
8542 jnz .loop
8543 mov rsp, [rsp+64] ; undo the manual stack realignment
8544 RET
8545
8546 ;-------------------------------------------------------------------------------------------------------------------
8547 ; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8548 ;-------------------------------------------------------------------------------------------------------------------
; Same reference gathering and row setup as intra_pred_ang32_14, but r2 and r3 are
; exchanged first, PROC32_8x8 is invoked with 0 as its second argument, and each loop
; pass moves the destination 8 bytes to the right instead of 8 lines down.
; Stack layout after the prologue (rsp rounded down to a multiple of 64):
;   [rsp +  0 .. rsp + 12]  13 bytes gathered from [r3] / [r3 + 15] via c_mode32_14_0
;   [rsp + 13 .. rsp + 44]  32 bytes copied from [r2 + 1]
;   [rsp + 63]              byte-sized loop counter
;   [rsp + 64]              rsp as it was before the realignment, reloaded before RET
8549 INIT_XMM sse4
8550 cglobal intra_pred_ang32_22, 4,7,8
8551 ; NOTE: align the stack to 64 bytes so that all local data sits in one cache line
8552
8553 xchg r2, r3 ; swap the third and fourth arguments (refLeft / refAbove in the prototype comment)
8554 mov r6, rsp
8555 sub rsp, 64+gprsize
8556 and rsp, ~63
8557 mov [rsp+64], r6
8558
8559 ; collect reference pixel
8560 movu m0, [r3]
8561 movu m1, [r3 + 15]
8562 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
8563 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
8564 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
8565 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
8566 mova [rsp], m0
8567 movu m0, [r2 + 1]
8568 movu m1, [r2 + 1 + 16]
8569 movu [rsp + 13], m0 ; overwrites the three don't-care bytes at rsp + 13..15
8570 movu [rsp + 13 + 16], m1
8571 mov [rsp + 63], byte 4 ; all seven GPRs are in use, so the counter lives in memory
8572
8573 ; filter
8574 lea r2, [rsp + 13] ; r2 -> [0]
8575 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8576 lea r4, [ang_table] ; r4 -> ang_table
8577 lea r5, [r1 * 3] ; r5 -> 3 * stride
8578 lea r6, [r0] ; r6 -> r0
8579 mova m5, [pw_1024] ; m5 -> 1024
8580 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; NOTE(review): m5 and m6 are overwritten by the row setup at the top of .loop before
; PROC32_8x8 is reached, so the two loads above look redundant - confirm against
; PROC32_8x8 before removing them.
8581
; In the loop, m7 holds a 16-byte window of the stack array. "palignr mN, m7, k" is the
; 2-operand form: the low 16 - k bytes of mN become the window starting k bytes later,
; the top k bytes keep whatever mN held before.
8582 .loop:
8583 ; Row[0 - 7]
8584 movu m7, [r2 - 4]
8585 palignr m0, m7, 3
8586 mova m1, m0
8587 palignr m2, m7, 2
8588 mova m3, m2
8589 palignr m4, m7, 1
8590 mova m5, m4
8591 mova m6, m4
8592 PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24
8593
8594 ; Row[8 - 15]
8595 movu m7, [r2 - 7]
8596 palignr m0, m7, 3
8597 palignr m1, m7, 2
8598 mova m2, m1
8599 mova m3, m1
8600 palignr m4, m7, 1
8601 mova m5, m4
8602 mova m6, m7
8603 lea r0, [r0 + r1 * 4] ; dst += 4 lines here (any further advance would be inside PROC32_8x8 - not visible)
8604 PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16
8605
8606 ; Row[16 - 23]
8607 movu m7, [r2 - 10]
8608 palignr m0, m7, 3
8609 palignr m1, m7, 2
8610 mova m2, m1
8611 palignr m3, m7, 1
8612 mova m4, m3
8613 mova m5, m3
8614 mova m6, m7
8615 lea r0, [r0 + r1 * 4]
8616 PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8
8617
8618 ; Row[24 - 31]
8619 movu m7, [r2 - 13]
8620 palignr m0, m7, 2
8621 mova m1, m0
8622 mova m2, m0
8623 palignr m3, m7, 1
8624 mova m4, m3
8625 mova m5, m7
8626 mova m6, m7
8627 lea r0, [r0 + r1 * 4]
8628 PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0
8629
; next pass: restart at the top line, 8 bytes further right; source window += 8 bytes
8630 add r6, 8
8631 mov r0, r6
8632 add r2, 8
8633 dec byte [rsp + 63]
8634 jnz .loop
8635 mov rsp, [rsp+64] ; undo the manual stack realignment
8636 RET
8637
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; dst is produced in four passes, each followed by an 8-byte advance of the
; output pointer: one MODE_13_23_ROW0 pass, then three MODE_13_23 passes.
; Both macros are defined elsewhere in this file.
;   r4 = ang_table + 16 * 16, r5 = 3 * dstStride, r6 = dst base of the current pass,
;   m7 = pw_1024, `above` = one mmsize stack slot reserved by cglobal.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above    [rsp + 0 * mmsize]
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]                    ; r5 -> 3 * stride
    lea         r4, [ang_table + 16 * 16]

    ; first pass
    MODE_13_23_ROW0 0
    add         r2, 7                           ; first advance is 7 bytes, later ones are 8
    add         r6, 8
    mov         r0, r6
    mov         r3, 3                           ; r3 is reused as the count of remaining passes
.loop:
    MODE_13_23  0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r3
    jnz        .loop
    RET
8663
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; dst is produced in four passes, each followed by an 8-byte advance of the
; output pointer: one MODE_12_24_ROW0 pass, then three MODE_12_24 passes.
; Both macros are defined elsewhere in this file.
;   r4 = ang_table + 16 * 16, r5 = 3 * dstStride, r6 = dst base of the current pass,
;   m7 = pw_1024, `above` = one mmsize stack slot reserved by cglobal.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above    [rsp + 0 * mmsize]
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]                    ; r5 -> 3 * stride
    lea         r4, [ang_table + 16 * 16]

    ; first pass
    MODE_12_24_ROW0 0
    add         r2, 7                           ; first advance is 7 bytes, later ones are 8
    add         r6, 8
    mov         r0, r6
    mov         r3, 3                           ; r3 is reused as the count of remaining passes
.loop:
    MODE_12_24  0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r3
    jnz        .loop
    RET
8689
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; The body never reads dirMode or bFilter.
;
; Register roles once setup is finished:
;   r0 = current output pointer          r1 = dstStride
;   r2 = cursor into the stack reference row (advances 8 bytes per strip)
;   r3 = c_shuf8_0                       r4 = ang_table
;   r5 = 3 * dstStride                   r6 = dst base of the current 8-byte-wide strip
;
; Stack scratch (rsp is re-aligned to 64 bytes):
;   [rsp +  0]         refLeft[16] (the 16-byte broadcast is overwritten from rsp + 1 on)
;   [rsp +  1 .. 48]   refAbove[0 .. 47]
;   [rsp + 63]         strip counter (starts at 4)
;   [rsp + 64]         caller's rsp, restored before RET
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                    ; keep the caller's rsp for the epilogue

    ; collect reference pixel
    movu        m0, [r3 + 16]
    pxor        m1, m1
    pshufb      m0, m1                          ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4

    ; filter
    ; The original also loaded m5 <- [pw_1024] and m6 <- [c_deinterval8] here.
    ; Both registers are overwritten at the top of every pass, before the first
    ; PROC32_8x8, so those loads had no effect and have been dropped.
    lea         r2, [rsp + 1]                   ; r2 -> [0]
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle8
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 -> 3 * stride
    mov         r6, r0                          ; r6 -> r0

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    ; next strip: 8 bytes further right in dst, 8 bytes further along the reference row
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET
8779
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Copies refAbove[1 .. 32] into each of the 32 output rows, 16 columns per pass
; (two passes).  When bFilter is non-zero an extra per-row adjustment is written
; over one byte of rows 0-15 after each pass (see the note in the filter section).
;
; Register roles:
;   r0 = dst for the current 16-column half      r1 = dstStride
;   r2 = refLeft on entry, then r2d = bFilter    r3 = refAbove cursor
;   r4 = 3 * dstStride                           r5 = bFilter on entry, then a row pointer
;   r6 = pass counter (2)
;   m8 / m9 = stack copies of 16 bytes at refLeft[0] / refLeft[1]
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    lea         r4, [r1 * 3]
    mov         r6, 2
    movu        m0, [r2]
    movu        m1, [r2 + 1]
    mova        m8, m0
    mova        m9, m1
    mov         r2d, r5d                        ; r5 is needed as a scratch pointer below

.loop:
    movu        m0, [r3 + 1]

    ; Each row is stored exactly once per pass.  The original reset r5 to
    ; r0 + 4 * stride after row 15 and therefore stored rows 4-15 a second time
    ; with the same data; those duplicate stores are gone.

    ; rows 0 - 3
    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    ; rows 4 - 7
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; rows 8 - 11
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; rows 12 - 15
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; rows 16 - 19
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; rows 20 - 23
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; rows 24 - 27
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    ; rows 28 - 31
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0

    ; filter
    ; NOTE(review): m8/m9 are captured once, and r0 has already moved 16 bytes
    ; right on the second pass, so with bFilter != 0 the second pass rewrites
    ; byte 16 of rows 0-15 (from the 17th refAbove sample and the same 16 left
    ; samples).  Byte 0 of rows 16-31 is never adjusted.  This is unchanged from
    ; the original; confirm with the callers whether bFilter is ever set for
    ; 32x32 blocks before touching it.
    test        r2d, r2d
    jz         .quit

    pxor        m4, m4
    pshufb      m0, m4                          ; broadcast byte 0 of this pass's refAbove load
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m4                          ; broadcast refLeft[0]
    pmovzxbw    m2, m2
    movhlps     m4, m3                          ; m4 = upper 8 bytes of the refLeft[1..] copy
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                          ; 16 bytes, one per row 0-15, saturated to 0..255

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    lea         r5, [r0 + r1 * 4]
    pextrb      [r5], m0, 4
    pextrb      [r5 + r1], m0, 5
    pextrb      [r5 + r1 * 2], m0, 6
    pextrb      [r5 + r4], m0, 7
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 12
    pextrb      [r5 + r1], m0, 13
    pextrb      [r5 + r1 * 2], m0, 14
    pextrb      [r5 + r4], m0, 15

.quit:
    lea         r3, [r3 + 16]                   ; next 16 reference samples
    add         r0, 16                          ; next 16 output columns
    dec         r6d
    jnz        .loop
    RET
8903
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_9_27 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    mov         r2, r3mp                        ; fetch refAbove before r3 is repurposed
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
.loop:
    MODE_9_27   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4
    jnz        .loop
    RET
8923
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_8_28 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    mov         r2, r3mp                        ; fetch refAbove before r3 is repurposed
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
.loop:
    MODE_8_28   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4
    jnz        .loop
    RET
8943
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_7_29 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    mov         r2, r3mp                        ; fetch refAbove before r3 is repurposed
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
.loop:
    MODE_7_29   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4
    jnz        .loop
    RET
8963
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_6_30 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    mov         r2, r3mp                        ; fetch refAbove before r3 is repurposed
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
.loop:
    MODE_6_30   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4
    jnz        .loop
    RET
8983
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_5_31 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    mov         r2, r3mp                        ; fetch refAbove before r3 is repurposed
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
.loop:
    MODE_5_31   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4
    jnz        .loop
    RET
9003
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_4_32 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    mov         r2, r3mp                        ; fetch refAbove before r3 is repurposed
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
.loop:
    MODE_4_32   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4
    jnz        .loop
    RET
9023
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_3_33 (defined elsewhere in this file) four times; after each run
; the output pointer and the refAbove cursor both move 8 bytes forward.
;   r2 = refAbove cursor, r3 = ang_table + 16 * 16, r4 = passes left,
;   r5 = 3 * dstStride, r6 = dst base of the current pass, m7 = pw_1024.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    ; Plain load, as in intra_pred_ang32_27 .. _32.  The original used
    ; `xchg r2, r3mp`; when r3mp is a stack slot that is an implicitly locked
    ; read-modify-write which also overwrites the caller's argument slot, and
    ; the value swapped out of r2 was never used afterwards.
    mov         r2, r3mp
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_3_33   0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET
9043
;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
;
; Writes one 16-byte 4x4 block per mode, modes 2..34 in order, so mode N lands
; at dest + (N - 2) * 16 (33 blocks, 528 bytes).
;
;   r0 = dest, r1 = above0, r2 = left0
;   r3 / r4 (above1 / left1) are never read.
;   r5 (bLuma) is overwritten with the ang_table address without being read;
;   the extra first-byte adjustment of modes 10 and 26 is always applied.
;   m0 = pw_1024 from mode 3 until mode 34, m7 = ang_table row 20 throughout.
;-----------------------------------------------------------------------------

; ALL_ANGS4_ROW dstOffset, tmp, pairs, weights
;   tmp = pmaddubsw(pairs, weights); tmp = pmulhrsw(tmp, m0); pack to bytes;
;   store the low 4 bytes at [r0 + dstOffset].
;   `tmp` may be the same register as `pairs`, in which case pairs is consumed.
;   m0 must hold pw_1024 (presumably packed words of 1024, which makes the
;   pmulhrsw a rounded shift right by 5 - verify against the constant table).
%macro ALL_ANGS4_ROW 4
    pmaddubsw   %2, %3, %4
    pmulhrsw    %2, m0
    packuswb    %2, %2
    movd        [r0 + %1], %2
%endmacro

INIT_XMM sse4
cglobal all_angs_pred_4x4, 6, 6, 8

    ; mode 2

    movh        m0, [r2 + 2]
    movd        [r0], m0

    palignr     m1, m0, 1
    movd        [r0 + 4], m1

    palignr     m1, m0, 2
    movd        [r0 + 8], m1

    psrldq      m0, 3
    movd        [r0 + 12], m0

    ; mode 3

    mova        m0, [pw_1024]

    movh        m1, [r2 + 1]

    palignr     m2, m1, 1
    punpcklbw   m1, m2                          ; m1 = adjacent left0 byte pairs, starting at left0[1]

    lea         r5, [ang_table]

    ALL_ANGS4_ROW  16, m5, m1, [r5 + 26 * 16]   ; m5 is reused by mode 6

    palignr     m2, m1, 2                       ; m2 = m1 pairs advanced by one sample

    mova        m7, [r5 + 20 * 16]

    ALL_ANGS4_ROW  20, m6, m2, m7               ; m6 is reused by mode 6

    palignr     m3, m1, 4                       ; m3 = m1 pairs advanced by two samples

    ALL_ANGS4_ROW  24, m4, m3, [r5 + 14 * 16]

    palignr     m4, m1, 6

    ALL_ANGS4_ROW  28, m4, m4, [r5 + 8 * 16]

    ; mode 4

    ALL_ANGS4_ROW  32, m4, m1, [r5 + 21 * 16]
    ALL_ANGS4_ROW  36, m4, m2, [r5 + 10 * 16]
    ALL_ANGS4_ROW  40, m4, m2, [r5 + 31 * 16]
    ALL_ANGS4_ROW  44, m4, m3, m7

    ; mode 5

    ALL_ANGS4_ROW  48, m4, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW  52, m4, m2, [r5 + 2 * 16]
    ALL_ANGS4_ROW  56, m4, m2, [r5 + 19 * 16]
    ALL_ANGS4_ROW  60, m3, m3, [r5 + 4 * 16]

    ; mode 6

    ALL_ANGS4_ROW  64, m3, m1, [r5 + 13 * 16]

    movd        [r0 + 68], m5                   ; same bytes as mode 3, first store

    ALL_ANGS4_ROW  72, m3, m2, [r5 + 7 * 16]

    movd        [r0 + 76], m6                   ; same bytes as mode 3, second store

    ; mode 7

    ALL_ANGS4_ROW  80, m3, m1, [r5 + 9 * 16]
    ALL_ANGS4_ROW  84, m3, m1, [r5 + 18 * 16]
    ALL_ANGS4_ROW  88, m3, m1, [r5 + 27 * 16]
    ALL_ANGS4_ROW  92, m2, m2, [r5 + 4 * 16]

    ; mode 8

    ALL_ANGS4_ROW  96, m2, m1, [r5 + 5 * 16]
    ALL_ANGS4_ROW 100, m2, m1, [r5 + 10 * 16]
    ALL_ANGS4_ROW 104, m2, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 108, m2, m1, m7

    ; mode 9

    ALL_ANGS4_ROW 112, m2, m1, [r5 + 2 * 16]
    ALL_ANGS4_ROW 116, m2, m1, [r5 + 4 * 16]
    ALL_ANGS4_ROW 120, m2, m1, [r5 + 6 * 16]
    ALL_ANGS4_ROW 124, m1, m1, [r5 + 8 * 16]

    ; mode 10

    movh        m1, [r2]
    palignr     m2, m1, 1
    pshufd      m3, m2, 0                       ; left0[1..4] repeated four times
    movu        [r0 + 128], m3

    pxor        m3, m3

    pshufb      m4, m2, m3                      ; broadcast left0[1]
    punpcklbw   m4, m3

    movh        m5, [r1]

    pshufb      m6, m5, m3                      ; broadcast above0[0]
    punpcklbw   m6, m3

    psrldq      m5, 1                           ; above0[1..]
    punpcklbw   m5, m3

    psubw       m5, m6
    psraw       m5, 1

    paddw       m4, m5

    packuswb    m4, m3

    ; overwrite the first byte of each 4-byte group of the block stored above
    pextrb      [r0 + 128], m4, 0
    pextrb      [r0 + 132], m4, 1
    pextrb      [r0 + 136], m4, 2
    pextrb      [r0 + 140], m4, 3

    ; mode 11

    palignr     m2, m1, 1
    punpcklbw   m1, m2                          ; m1 = adjacent left0 byte pairs, starting at left0[0]

    ALL_ANGS4_ROW 144, m2, m1, [r5 + 30 * 16]
    ALL_ANGS4_ROW 148, m2, m1, [r5 + 28 * 16]
    ALL_ANGS4_ROW 152, m2, m1, [r5 + 26 * 16]
    ALL_ANGS4_ROW 156, m2, m1, [r5 + 24 * 16]

    ; mode 12

    ALL_ANGS4_ROW 160, m2, m1, [r5 + 27 * 16]
    ALL_ANGS4_ROW 164, m2, m1, [r5 + 22 * 16]
    ALL_ANGS4_ROW 168, m2, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW 172, m2, m1, [r5 + 12 * 16]

    ; mode 13

    ALL_ANGS4_ROW 176, m2, m1, [r5 + 23 * 16]
    ALL_ANGS4_ROW 180, m2, m1, [r5 + 14 * 16]
    ALL_ANGS4_ROW 184, m2, m1, [r5 + 5 * 16]

    pslldq      m2, m1, 2                       ; make room for one more pair in front
    pinsrb      m2, [r1 + 0], 1
    pinsrb      m2, [r1 + 4], 0

    ALL_ANGS4_ROW 188, m3, m2, [r5 + 28 * 16]

    ; mode 14

    ALL_ANGS4_ROW 192, m3, m1, [r5 + 19 * 16]
    ALL_ANGS4_ROW 196, m5, m1, [r5 + 6 * 16]    ; m5 is reused by mode 17

    pinsrb      m2, [r1 + 2], 0

    ALL_ANGS4_ROW 200, m3, m2, [r5 + 25 * 16]
    ALL_ANGS4_ROW 204, m3, m2, [r5 + 12 * 16]

    ; mode 15

    ALL_ANGS4_ROW 208, m3, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 212, m3, m2, [r5 + 30 * 16]
    ALL_ANGS4_ROW 216, m3, m2, [r5 + 13 * 16]

    pslldq      m3, m2, 2
    pinsrb      m3, [r1 + 2], 1
    pinsrb      m3, [r1 + 4], 0

    ALL_ANGS4_ROW 220, m4, m3, [r5 + 28 * 16]

    ; mode 16

    ALL_ANGS4_ROW 224, m4, m1, [r5 + 11 * 16]
    ALL_ANGS4_ROW 228, m4, m2, [r5 + 22 * 16]
    ALL_ANGS4_ROW 232, m4, m2, [r5 + 1 * 16]

    pinsrb      m3, [r1 + 3], 0

    ALL_ANGS4_ROW 236, m3, m3, [r5 + 12 * 16]

    ; mode 17

    movd        [r0 + 240], m5                  ; same bytes as mode 14, second store

    pslldq      m1, 2
    pinsrb      m1, [r1 + 1], 0
    pinsrb      m1, [r1 + 0], 1

    ALL_ANGS4_ROW 244, m2, m1, [r5 + 12 * 16]

    pslldq      m1, 2
    pinsrb      m1, [r1 + 2], 0
    pinsrb      m1, [r1 + 1], 1

    ALL_ANGS4_ROW 248, m2, m1, [r5 + 18 * 16]

    pslldq      m1, 2
    pinsrb      m1, [r1 + 4], 0
    pinsrb      m1, [r1 + 2], 1

    ALL_ANGS4_ROW 252, m1, m1, [r5 + 24 * 16]

    ; mode 18

    movh        m1, [r1]
    movd        [r0 + 256], m1

    pslldq      m2, m1, 1
    pinsrb      m2, [r2 + 1], 0
    movd        [r0 + 260], m2

    pslldq      m3, m2, 1
    pinsrb      m3, [r2 + 2], 0
    movd        [r0 + 264], m3

    pslldq      m4, m3, 1
    pinsrb      m4, [r2 + 3], 0
    movd        [r0 + 268], m4

    ; mode 19

    palignr     m4, m1, 1
    punpcklbw   m1, m4                          ; m1 = adjacent above0 byte pairs, starting at above0[0]

    ALL_ANGS4_ROW 272, m5, m1, [r5 + 6 * 16]    ; m5 is reused by mode 22

    pslldq      m2, m1, 2
    pinsrb      m2, [r2 + 1], 0
    pinsrb      m2, [r2], 1

    ALL_ANGS4_ROW 276, m3, m2, [r5 + 12 * 16]

    pslldq      m3, m2, 2
    pinsrb      m3, [r2 + 1], 1
    pinsrb      m3, [r2 + 2], 0

    ALL_ANGS4_ROW 280, m4, m3, [r5 + 18 * 16]

    pslldq      m3, 2
    pinsrb      m3, [r2 + 2], 1
    pinsrb      m3, [r2 + 4], 0

    ALL_ANGS4_ROW 284, m3, m3, [r5 + 24 * 16]

    ; mode 20

    ALL_ANGS4_ROW 288, m3, m1, [r5 + 11 * 16]

    pinsrb      m2, [r2 + 2], 0

    ALL_ANGS4_ROW 292, m3, m2, [r5 + 22 * 16]
    ALL_ANGS4_ROW 296, m3, m2, [r5 + 1 * 16]

    pslldq      m3, m2, 2
    pinsrb      m3, [r2 + 2], 1
    pinsrb      m3, [r2 + 3], 0

    ALL_ANGS4_ROW 300, m4, m3, [r5 + 12 * 16]

    ; mode 21

    ALL_ANGS4_ROW 304, m4, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 308, m4, m2, [r5 + 30 * 16]
    ALL_ANGS4_ROW 312, m4, m2, [r5 + 13 * 16]

    pinsrb      m3, [r2 + 4], 0

    ALL_ANGS4_ROW 316, m3, m3, [r5 + 28 * 16]

    ; mode 22

    ALL_ANGS4_ROW 320, m3, m1, [r5 + 19 * 16]

    movd        [r0 + 324], m5                  ; same bytes as mode 19, first store

    ALL_ANGS4_ROW 328, m3, m2, [r5 + 25 * 16]
    ALL_ANGS4_ROW 332, m3, m2, [r5 + 12 * 16]

    ; mode 23

    ALL_ANGS4_ROW 336, m3, m1, [r5 + 23 * 16]
    ALL_ANGS4_ROW 340, m3, m1, [r5 + 14 * 16]
    ALL_ANGS4_ROW 344, m3, m1, [r5 + 5 * 16]

    pinsrb      m2, [r2 + 4], 0

    ALL_ANGS4_ROW 348, m2, m2, [r5 + 28 * 16]

    ; mode 24

    ALL_ANGS4_ROW 352, m2, m1, [r5 + 27 * 16]
    ALL_ANGS4_ROW 356, m2, m1, [r5 + 22 * 16]
    ALL_ANGS4_ROW 360, m2, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW 364, m2, m1, [r5 + 12 * 16]

    ; mode 25

    ALL_ANGS4_ROW 368, m2, m1, [r5 + 30 * 16]
    ALL_ANGS4_ROW 372, m2, m1, [r5 + 28 * 16]
    ALL_ANGS4_ROW 376, m2, m1, [r5 + 26 * 16]
    ALL_ANGS4_ROW 380, m2, m1, [r5 + 24 * 16]

    ; mode 26

    movh        m1, [r1 + 1]
    pshufd      m2, m1, 0                       ; above0[1..4] repeated four times
    movu        [r0 + 384], m2

    pxor        m2, m2

    pshufb      m3, m1, m2                      ; broadcast above0[1]
    punpcklbw   m3, m2

    movh        m4, [r2]

    pshufb      m5, m4, m2                      ; broadcast left0[0]
    punpcklbw   m5, m2

    psrldq      m4, 1                           ; left0[1..]
    punpcklbw   m4, m2

    psubw       m4, m5
    psraw       m4, 1

    paddw       m3, m4

    packuswb    m3, m2

    ; overwrite the first byte of each 4-byte group of the block stored above
    pextrb      [r0 + 384], m3, 0
    pextrb      [r0 + 388], m3, 1
    pextrb      [r0 + 392], m3, 2
    pextrb      [r0 + 396], m3, 3

    ; mode 27

    palignr     m2, m1, 1
    punpcklbw   m1, m2                          ; m1 = adjacent above0 byte pairs, starting at above0[1]

    ALL_ANGS4_ROW 400, m2, m1, [r5 + 2 * 16]
    ALL_ANGS4_ROW 404, m2, m1, [r5 + 4 * 16]
    ALL_ANGS4_ROW 408, m2, m1, [r5 + 6 * 16]
    ALL_ANGS4_ROW 412, m2, m1, [r5 + 8 * 16]

    ; mode 28

    ALL_ANGS4_ROW 416, m2, m1, [r5 + 5 * 16]
    ALL_ANGS4_ROW 420, m2, m1, [r5 + 10 * 16]
    ALL_ANGS4_ROW 424, m2, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 428, m2, m1, m7

    ; mode 29

    ALL_ANGS4_ROW 432, m2, m1, [r5 + 9 * 16]
    ALL_ANGS4_ROW 436, m2, m1, [r5 + 18 * 16]
    ALL_ANGS4_ROW 440, m2, m1, [r5 + 27 * 16]

    palignr     m2, m1, 2                       ; m2 = m1 pairs advanced by one sample

    ALL_ANGS4_ROW 444, m3, m2, [r5 + 4 * 16]

    ; mode 30

    ALL_ANGS4_ROW 448, m3, m1, [r5 + 13 * 16]
    ALL_ANGS4_ROW 452, m6, m1, [r5 + 26 * 16]   ; m6 is reused by mode 33
    ALL_ANGS4_ROW 456, m3, m2, [r5 + 7 * 16]
    ALL_ANGS4_ROW 460, m5, m2, m7               ; m5 is reused by mode 33

    ; mode 31

    ALL_ANGS4_ROW 464, m3, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW 468, m3, m2, [r5 + 2 * 16]
    ALL_ANGS4_ROW 472, m3, m2, [r5 + 19 * 16]

    palignr     m3, m2, 2                       ; m3 = m1 pairs advanced by two samples

    ALL_ANGS4_ROW 476, m4, m3, [r5 + 4 * 16]

    ; mode 32

    ALL_ANGS4_ROW 480, m4, m1, [r5 + 21 * 16]
    ALL_ANGS4_ROW 484, m4, m2, [r5 + 10 * 16]
    ALL_ANGS4_ROW 488, m4, m2, [r5 + 31 * 16]
    ALL_ANGS4_ROW 492, m4, m3, m7

    ; mode 33

    movd        [r0 + 496], m6                  ; same bytes as mode 30, second store

    movd        [r0 + 500], m5                  ; same bytes as mode 30, fourth store

    ALL_ANGS4_ROW 504, m4, m3, [r5 + 14 * 16]

    psrldq      m3, 2                           ; pairs advanced by three samples

    ALL_ANGS4_ROW 508, m3, m3, [r5 + 8 * 16]

    ; mode 34

    movh        m0, [r1 + 2]                    ; m0 no longer holds pw_1024 from here on
    movd        [r0 + 512], m0

    palignr     m1, m0, 1
    movd        [r0 + 516], m1

    palignr     m1, m0, 2
    movd        [r0 + 520], m1

    palignr     m1, m0, 3
    movd        [r0 + 524], m1

    RET
9835
9836 ;-----------------------------------------------------------------------------
9837 ; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
9838 ;-----------------------------------------------------------------------------
9839 INIT_XMM sse4
9840 cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
9841
9842 ; mode 2
9843
9844 movu m0, [r4 + 2]
9845
9846 palignr m1, m0, 1
9847 punpcklqdq m2, m0, m1
9848 movu [r0], m2
9849
9850 palignr m1, m0, 2
9851 palignr m2, m0, 3
9852 punpcklqdq m1, m2
9853 movu [r0 + 16], m1
9854
9855 palignr m1, m0, 4
9856 palignr m2, m0, 5
9857 punpcklqdq m1, m2
9858 movu [r0 + 32], m1
9859
9860 palignr m1, m0, 6
9861 palignr m2, m0, 7
9862 punpcklqdq m1, m2
9863 movu [r0 + 48], m1
9864
9865 ; mode 3 [row 0, 1]
9866
9867 mova m7, [pw_1024]
9868 lea r5, [ang_table]
9869
9870 movu m0, [r2 + 1]
9871
9872 palignr m1, m0, 1
9873 palignr m2, m0, 2
9874
9875 punpcklbw m3, m0, m1
9876 pmaddubsw m4, m3, [r5 + 26 * 16]
9877 pmulhrsw m4, m7
9878
9879 punpcklbw m1, m2
9880 pmaddubsw m5, m1, [r5 + 20 * 16]
9881 pmulhrsw m5, m7
9882
9883 packuswb m4, m5
9884
9885 movu [r0 + 64], m4
9886
9887 ; mode 6 [row 1]
9888
9889 movh [r0 + 264], m4
9890
9891 ; mode 6 [row 3]
9892
9893 movhps [r0 + 280], m4
9894
9895 ; mode 4 [row 0, 1]
9896
9897 pmaddubsw m4, m3, [r5 + 21 * 16]
9898 pmulhrsw m4, m7
9899
9900 pmaddubsw m5, m1, [r5 + 10 * 16]
9901 pmulhrsw m5, m7
9902
9903 packuswb m4, m5
9904 movu [r0 + 128], m4
9905
9906 ; mode 5 [row 0, 1]
9907
9908 pmaddubsw m4, m3, [r5 + 17 * 16]
9909 pmulhrsw m4, m7
9910
9911 pmaddubsw m5, m1, [r5 + 2 * 16]
9912 pmulhrsw m5, m7
9913
9914 packuswb m4, m5
9915 movu [r0 + 192], m4
9916
9917 ; mode 6 [row 0]
9918
9919 pmaddubsw m4, m3, [r5 + 13 * 16]
9920 pmulhrsw m4, m7
9921
9922 pxor m5, m5
9923
9924 packuswb m4, m5
9925 movh [r0 + 256], m4
9926
9927 ; mode 7 [row 0, 1]
9928
9929 pmaddubsw m4, m3, [r5 + 9 * 16]
9930 pmulhrsw m4, m7
9931
9932 pmaddubsw m5, m3, [r5 + 18 * 16]
9933 pmulhrsw m5, m7
9934
9935 packuswb m4, m5
9936 movu [r0 + 320], m4
9937
9938 ; mode 8 [row 0, 1]
9939
9940 pmaddubsw m4, m3, [r5 + 5 * 16]
9941 pmulhrsw m4, m7
9942
9943 pmaddubsw m5, m3, [r5 + 10 * 16]
9944 pmulhrsw m5, m7
9945
9946 packuswb m4, m5
9947 movu [r0 + 384], m4
9948
9949 ; mode 8 [row 2, 3]
9950
9951 pmaddubsw m4, m3, [r5 + 15 * 16]
9952 pmulhrsw m4, m7
9953
9954 pmaddubsw m5, m3, [r5 + 20 * 16]
9955 pmulhrsw m5, m7
9956
9957 packuswb m4, m5
9958 movu [r0 + 400], m4
9959
9960 ; mode 8 [row 4, 5]
9961
9962 pmaddubsw m4, m3, [r5 + 25 * 16]
9963 pmulhrsw m4, m7
9964
9965 pmaddubsw m5, m3, [r5 + 30 * 16]
9966 pmulhrsw m5, m7
9967
9968 packuswb m4, m5
9969 movu [r0 + 416], m4
9970
9971 ; mode 8 [row 6, 7]
9972
9973 pmaddubsw m4, m1, [r5 + 3 * 16]
9974 pmulhrsw m4, m7
9975
9976 pmaddubsw m5, m1, [r5 + 8 * 16]
9977 pmulhrsw m5, m7
9978
9979 packuswb m4, m5
9980 movu [r0 + 432], m4
9981
9982 ; mode 9 [row 0, 1]
9983
9984 pmaddubsw m4, m3, [r5 + 2 * 16]
9985 pmulhrsw m4, m7
9986
9987 pmaddubsw m5, m3, [r5 + 4 * 16]
9988 pmulhrsw m5, m7
9989
9990 packuswb m4, m5
9991 movu [r0 + 448], m4
9992
9993 ; mode 9 [row 2, 3]
9994
9995 pmaddubsw m4, m3, [r5 + 6 * 16]
9996 pmulhrsw m4, m7
9997
9998 pmaddubsw m5, m3, [r5 + 8 * 16]
9999 pmulhrsw m5, m7
10000
10001 packuswb m4, m5
10002 movu [r0 + 464], m4
10003
10004 ; mode 9 [row 4, 5]
10005
10006 pmaddubsw m4, m3, [r5 + 10 * 16]
10007 pmulhrsw m4, m7
10008
10009 pmaddubsw m5, m3, [r5 + 12 * 16]
10010 pmulhrsw m5, m7
10011
10012 packuswb m4, m5
10013 movu [r0 + 480], m4
10014
10015 ; mode 9 [row 6, 7]
10016
10017 pmaddubsw m4, m3, [r5 + 14 * 16]
10018 pmulhrsw m4, m7
10019
10020 pmaddubsw m5, m3, [r5 + 16 * 16]
10021 pmulhrsw m5, m7
10022
10023 packuswb m4, m5
10024 movu [r0 + 496], m4
10025
10026 ; mode 7 [row 2, 3]
10027
10028 pmaddubsw m4, m3, [r5 + 27 * 16]
10029 pmulhrsw m4, m7
10030
10031 pmaddubsw m5, m1, [r5 + 4 * 16]
10032 pmulhrsw m5, m7
10033
10034 packuswb m4, m5
10035 movu [r0 + 336], m4
10036
10037 ; mode 7 [row 4, 5]
10038
10039 pmaddubsw m4, m1, [r5 + 13 * 16]
10040 pmulhrsw m4, m7
10041
10042 pmaddubsw m5, m1, [r5 + 22 * 16]
10043 pmulhrsw m5, m7
10044
10045 packuswb m4, m5
10046 movu [r0 + 352], m4
10047
10048 ; mode 6 [row 2]
10049
10050 pmaddubsw m4, m1, [r5 + 7 * 16]
10051 pmulhrsw m4, m7
10052
10053 pxor m5, m5
10054
10055 packuswb m4, m5
10056 movh [r0 + 272], m4
10057
10058 ; mode 3 [row 2, 3]
10059
10060 palignr m1, m0, 3
10061 palignr m3, m0, 4
10062
10063 punpcklbw m2, m1
10064 pmaddubsw m5, m2, [r5 + 14 * 16]
10065 pmulhrsw m5, m7
10066
10067 punpcklbw m1, m3
10068 pmaddubsw m6, m1, [r5 + 8 * 16]
10069 pmulhrsw m6, m7
10070
10071 packuswb m5, m6
10072 movu [r0 + 80], m5
10073
10074 ; mode 6 [row 7]
10075
10076 movhps [r0 + 312], m5
10077
10078 ; mode 6 [row 5]
10079
10080 movh [r0 + 296], m5
10081
10082 ; mode 4 [calculate and store row 4, 5]
10083
10084 pmaddubsw m4, m1, [r5 + 9 * 16]
10085 pmulhrsw m4, m7
10086
10087 pmaddubsw m5, m1, [r5 + 30 * 16]
10088 pmulhrsw m5, m7
10089
10090 packuswb m4, m5
10091 movu [r0 + 160], m4
10092
10093 ; mode 5 [row 4, 5]
10094
10095 pmaddubsw m4, m2, [r5 + 21 * 16]
10096 pmulhrsw m4, m7
10097
10098 pmaddubsw m5, m1, [r5 + 6 * 16]
10099 pmulhrsw m5, m7
10100
10101 packuswb m4, m5
10102 movu [r0 + 224], m4
10103
10104 ; mode 6 [row 4]
10105
10106 pmaddubsw m5, m2, [r5 + 1 * 16]
10107 pmulhrsw m5, m7
10108
10109 pxor m6, m6
10110
10111 packuswb m5, m6
10112 movh [r0 + 288], m5
10113
10114 ; mode 6 [row 6]
10115
10116 pmaddubsw m5, m2, [r5 + 27 * 16]
10117 pmulhrsw m5, m7
10118
10119 pxor m6, m6
10120
10121 packuswb m5, m6
10122 movh [r0 + 304], m5
10123
10124 ; mode 5 [calculate row 6]
10125
10126 pmaddubsw m6, m1, [r5 + 23 * 16]
10127 pmulhrsw m6, m7
10128
10129 ; mode 3 [row 4, 5]
10130
10131 palignr m1, m0, 5
10132
10133 punpcklbw m3, m1
10134 pmaddubsw m4, m3, [r5 + 2 * 16]
10135 pmulhrsw m4, m7
10136
10137 pmaddubsw m5, m3, [r5 + 28 * 16]
10138 pmulhrsw m5, m7
10139
10140 packuswb m4, m5
10141 movu [r0 + 96], m4
10142
10143 ; mode 4 [calculate row 6]
10144
10145 pmaddubsw m5, m3, [r5 + 19 * 16]
10146 pmulhrsw m5, m7
10147
10148 ; mode 5 [calculate row 7, store row 6 and 7]
10149
10150 pmaddubsw m4, m3, [r5 + 8 * 16]
10151 pmulhrsw m4, m7
10152
10153 packuswb m6, m4
10154 movu [r0 + 240], m6
10155
10156 ; mode 3 [row 6, 7]
10157
10158 palignr m2, m0, 6
10159 palignr m3, m0, 7
10160
10161 punpcklbw m1, m2
10162 pmaddubsw m4, m1, [r5 + 22 * 16]
10163 pmulhrsw m4, m7
10164
10165 punpcklbw m2, m3
10166 pmaddubsw m2, [r5 + 16 * 16]
10167 pmulhrsw m2, m7
10168
10169 packuswb m4, m2
10170 movu [r0 + 112], m4
10171
10172 ; mode 4 [calculate row 7]
10173
10174 pmaddubsw m2, m1, [r5 + 8 * 16]
10175 pmulhrsw m2, m7
10176
10177 ; mode 4 [store row 6 and 7]
10178
10179 packuswb m5, m2
10180 movu [r0 + 176], m5
10181
10182 ; mode 4 [row 2, 3]
10183
10184 palignr m1, m0, 1
10185 palignr m2, m0, 2
10186 palignr m3, m0, 3
10187
10188 punpcklbw m1, m2
10189 pmaddubsw m4, m1, [r5 + 31 * 16]
10190 pmulhrsw m4, m7
10191
10192 punpcklbw m2, m3
10193 pmaddubsw m5, m2, [r5 + 20 * 16]
10194 pmulhrsw m5, m7
10195
10196 packuswb m4, m5
10197 movu [r0 + 144], m4
10198
10199 ; mode 5 [row 2, 3]
10200
10201 pmaddubsw m4, m1, [r5 + 19 * 16]
10202 pmulhrsw m4, m7
10203
10204 pmaddubsw m5, m2, [r5 + 4 * 16]
10205 pmulhrsw m5, m7
10206
10207 packuswb m4, m5
10208 movu [r0 + 208], m4
10209
10210 ; mode 7 [row 6, 7]
10211
10212 pmaddubsw m4, m1, [r5 + 31 * 16]
10213 pmulhrsw m4, m7
10214
10215 pmaddubsw m5, m2, [r5 + 8 * 16]
10216 pmulhrsw m5, m7
10217
10218 packuswb m4, m5
10219 movu [r0 + 368], m4
10220
10221 ; mode 10
10222
10223 pshufb m1, m0, [tab_Si]
10224 movu [r0 + 512], m1
10225 movu [r0 + 528], m1
10226 movu [r0 + 544], m1
10227 movu [r0 + 560], m1
10228
10229 pxor m0, m0
10230
10231 pshufb m1, m1, m0
10232 punpcklbw m1, m0
10233
10234 movu m2, [r1]
10235
10236 pshufb m3, m2, m0
10237 punpcklbw m3, m0
10238
10239 psrldq m4, m2, 1
10240 punpcklbw m4, m0
10241
10242 movu m2, [r1 + 9]
10243 punpcklbw m2, m0
10244
10245 psubw m4, m3
10246 psubw m2, m3
10247
10248 psraw m4, 1
10249 psraw m2, 1
10250
10251 paddw m4, m1
10252 paddw m2, m1
10253
10254 packuswb m4, m2
10255
10256 pextrb [r0 + 512], m4, 0
10257 pextrb [r0 + 520], m4, 1
10258 pextrb [r0 + 528], m4, 2
10259 pextrb [r0 + 536], m4, 3
10260 pextrb [r0 + 544], m4, 4
10261 pextrb [r0 + 552], m4, 5
10262 pextrb [r0 + 560], m4, 6
10263 pextrb [r0 + 568], m4, 7
10264
10265 ; mode 11 [row 0, 1]
10266
10267 movu m0, [r2]
10268 palignr m1, m0, 1
10269 punpcklbw m2, m0, m1
10270
10271 pmaddubsw m3, m2, [r5 + 30 * 16]
10272 pmulhrsw m3, m7
10273
10274 pmaddubsw m4, m2, [r5 + 28 * 16]
10275 pmulhrsw m4, m7
10276
10277 packuswb m3, m4
10278 movu [r0 + 576], m3
10279
10280 ; mode 11 [row 2, 3]
10281
10282 pmaddubsw m3, m2, [r5 + 26 * 16]
10283 pmulhrsw m3, m7
10284
10285 pmaddubsw m4, m2, [r5 + 24 * 16]
10286 pmulhrsw m4, m7
10287
10288 packuswb m3, m4
10289 movu [r0 + 592], m3
10290
10291 ; mode 11 [row 4, 5]
10292
10293 pmaddubsw m3, m2, [r5 + 22 * 16]
10294 pmulhrsw m3, m7
10295
10296 pmaddubsw m4, m2, [r5 + 20 * 16]
10297 pmulhrsw m4, m7
10298
10299 packuswb m5, m3, m4
10300 movu [r0 + 608], m5
10301
10302 ; mode 12 [row 0, 1]
10303
10304 pmaddubsw m4, m2, [r5 + 27 * 16]
10305 pmulhrsw m4, m7
10306
10307 packuswb m4, m3
10308 movu [r0 + 640], m4
10309
10310 ; mode 11 [row 6, 7]
10311
10312 pmaddubsw m3, m2, [r5 + 18 * 16]
10313 pmulhrsw m3, m7
10314
10315 pmaddubsw m4, m2, [r5 + 16 * 16]
10316 pmulhrsw m4, m7
10317
10318 packuswb m3, m4
10319 movu [r0 + 624], m3
10320
10321 ; mode 12 [row 2, 3]
10322
10323 pmaddubsw m3, m2, [r5 + 17 * 16]
10324 pmulhrsw m3, m7
10325
10326 pmaddubsw m4, m2, [r5 + 12 * 16]
10327 pmulhrsw m4, m7
10328
10329 packuswb m3, m4
10330 movu [r0 + 656], m3
10331
10332 ; mode 12 [row 4, 5]
10333
10334 pmaddubsw m3, m2, [r5 + 7 * 16]
10335 pmulhrsw m3, m7
10336
10337 pmaddubsw m4, m2, [r5 + 2 * 16]
10338 pmulhrsw m4, m7
10339
10340 packuswb m3, m4
10341 movu [r0 + 672], m3
10342
10343 ; mode 12 [row 6, 7]
10344
10345 pslldq m3, m2, 2
10346 pinsrb m3, [r1 + 0], 1
10347 pinsrb m3, [r1 + 6], 0
10348
10349 pmaddubsw m4, m3, [r5 + 29 * 16]
10350 pmulhrsw m4, m7
10351
10352 pmaddubsw m5, m3, [r5 + 24 * 16]
10353 pmulhrsw m5, m7
10354
10355 packuswb m4, m5
10356 movu [r0 + 688], m4
10357
10358 ; mode 13 [row 0, 1]
10359
10360 pmaddubsw m4, m2, [r5 + 23 * 16]
10361 pmulhrsw m4, m7
10362
10363 pmaddubsw m5, m2, [r5 + 14 * 16]
10364 pmulhrsw m5, m7
10365
10366 packuswb m4, m5
10367 movu [r0 + 704], m4
10368
10369 ; mode 13 [row 2, 3]
10370
10371 pmaddubsw m4, m2, [r5 + 5 * 16]
10372 pmulhrsw m4, m7
10373
10374 pinsrb m3, [r1 + 4], 0
10375 pmaddubsw m5, m3, [r5 + 28 * 16]
10376 pmulhrsw m5, m7
10377
10378 packuswb m4, m5
10379 movu [r0 + 720], m4
10380
10381 ; mode 13 [row 4, 5]
10382
10383 pmaddubsw m4, m3, [r5 + 19 * 16]
10384 pmulhrsw m4, m7
10385
10386 pmaddubsw m5, m3, [r5 + 10 * 16]
10387 pmulhrsw m5, m7
10388
10389 packuswb m4, m5
10390 movu [r0 + 736], m4
10391
10392 ; mode 13 [row 6, 7]
10393
10394 pmaddubsw m4, m3, [r5 + 1 * 16]
10395 pmulhrsw m4, m7
10396
10397 pslldq m5, m3, 2
10398 pinsrb m5, [r1 + 4], 1
10399 pinsrb m5, [r1 + 7], 0
10400
10401 pmaddubsw m5, [r5 + 24 * 16]
10402 pmulhrsw m5, m7
10403
10404 packuswb m4, m5
10405 movu [r0 + 752], m4
10406
10407 ; mode 14 [row 0, 1]
10408
10409 pmaddubsw m4, m2, [r5 + 19 * 16]
10410 pmulhrsw m4, m7
10411
10412 pmaddubsw m5, m2, [r5 + 6 * 16]
10413 pmulhrsw m5, m7
10414
10415 packuswb m4, m5
10416 movu [r0 + 768], m4
10417
10418 ; mode 14 [row 2, 3]
10419
10420 pinsrb m3, [r1 + 2], 0
10421
10422 pmaddubsw m4, m3, [r5 + 25 * 16]
10423 pmulhrsw m4, m7
10424
10425 pmaddubsw m5, m3, [r5 + 12 * 16]
10426 pmulhrsw m5, m7
10427
10428 packuswb m4, m5
10429 movu [r0 + 784], m4
10430
10431 ; mode 14 [row 4, 5]
10432
10433 pslldq m1, m3, 2
10434 pinsrb m1, [r1 + 2], 1
10435 pinsrb m1, [r1 + 5], 0
10436
10437 pmaddubsw m4, m1, [r5 + 31 * 16]
10438 pmulhrsw m4, m7
10439
10440 pmaddubsw m5, m1, [r5 + 18 * 16]
10441 pmulhrsw m5, m7
10442
10443 packuswb m4, m5
10444 movu [r0 + 800], m4
10445
10446 ; mode 14 [row 6, 7]
10447
10448 pmaddubsw m4, m1, [r5 + 5 * 16]
10449 pmulhrsw m4, m7
10450
10451 pslldq m1, 2
10452 pinsrb m1, [r1 + 5], 1
10453 pinsrb m1, [r1 + 7], 0
10454
10455 pmaddubsw m5, m1, [r5 + 24 * 16]
10456 pmulhrsw m5, m7
10457
10458 packuswb m4, m5
10459 movu [r0 + 816], m4
10460
10461 ; mode 15 [row 0, 1]
10462
10463 pmaddubsw m4, m2, [r5 + 15 * 16]
10464 pmulhrsw m4, m7
10465
10466 pmaddubsw m5, m3, [r5 + 30 * 16]
10467 pmulhrsw m5, m7
10468
10469 packuswb m4, m5
10470 movu [r0 + 832], m4
10471
10472 ; mode 15 [row 2, 3]
10473
10474 pmaddubsw m4, m3, [r5 + 13 * 16]
10475 pmulhrsw m4, m7
10476
10477 pslldq m1, m3, 2
10478 pinsrb m1, [r1 + 2], 1
10479 pinsrb m1, [r1 + 4], 0
10480
10481 pmaddubsw m5, m1, [r5 + 28 * 16]
10482 pmulhrsw m5, m7
10483
10484 packuswb m4, m5
10485 movu [r0 + 848], m4
10486
10487 ; mode 15 [row 4, 5]
10488
10489 pmaddubsw m4, m1, [r5 + 11 * 16]
10490 pmulhrsw m4, m7
10491
10492 pslldq m1, 2
10493 pinsrb m1, [r1 + 4], 1
10494 pinsrb m1, [r1 + 6], 0
10495
10496 pmaddubsw m5, m1, [r5 + 26 * 16]
10497 pmulhrsw m5, m7
10498
10499 packuswb m4, m5
10500 movu [r0 + 864], m4
10501
10502 ; mode 15 [row 6, 7]
10503
10504 pmaddubsw m4, m1, [r5 + 9 * 16]
10505 pmulhrsw m4, m7
10506
10507 pslldq m1, 2
10508 pinsrb m1, [r1 + 6], 1
10509 pinsrb m1, [r1 + 8], 0
10510
10511 pmaddubsw m1, [r5 + 24 * 16]
10512 pmulhrsw m1, m7
10513
10514 packuswb m4, m1
10515 movu [r0 + 880], m4
10516
10517 ; mode 16 [row 0, 1]
10518
10519 pmaddubsw m4, m2, [r5 + 11 * 16]
10520 pmulhrsw m4, m7
10521
10522 pmaddubsw m5, m3, [r5 + 22 * 16]
10523 pmulhrsw m5, m7
10524
10525 packuswb m4, m5
10526 movu [r0 + 896], m4
10527
10528 ; mode 16 [row 2, 3]
10529
10530 pmaddubsw m4, m3, [r5 + 1 * 16]
10531 pmulhrsw m4, m7
10532
10533 pslldq m3, 2
10534 pinsrb m3, [r1 + 2], 1
10535 pinsrb m3, [r1 + 3], 0
10536
10537 pmaddubsw m5, m3, [r5 + 12 * 16]
10538 pmulhrsw m5, m7
10539
10540 packuswb m4, m5
10541 movu [r0 + 912], m4
10542
10543 ; mode 16 [row 4, 5]
10544
10545 pslldq m3, 2
10546 pinsrb m3, [r1 + 3], 1
10547 pinsrb m3, [r1 + 5], 0
10548
10549 pmaddubsw m4, m3, [r5 + 23 * 16]
10550 pmulhrsw m4, m7
10551
10552 pmaddubsw m5, m3, [r5 + 2 * 16]
10553 pmulhrsw m5, m7
10554
10555 packuswb m4, m5
10556 movu [r0 + 928], m4
10557
10558 ; mode 16 [row 6, 7]
10559
10560 pslldq m3, 2
10561 pinsrb m3, [r1 + 5], 1
10562 pinsrb m3, [r1 + 6], 0
10563
10564 pmaddubsw m4, m3, [r5 + 13 * 16]
10565 pmulhrsw m4, m7
10566
10567 pslldq m3, 2
10568 pinsrb m3, [r1 + 6], 1
10569 pinsrb m3, [r1 + 8], 0
10570
10571 pmaddubsw m3, [r5 + 24 * 16]
10572 pmulhrsw m3, m7
10573
10574 packuswb m4, m3
10575 movu [r0 + 944], m4
10576
10577 ; mode 17 [row 0, 1]
10578
10579 pmaddubsw m4, m2, [r5 + 6 * 16]
10580 pmulhrsw m4, m7
10581
10582 pslldq m2, 2
10583 pinsrb m2, [r1 + 0], 1
10584 pinsrb m2, [r1 + 1], 0
10585
10586 pmaddubsw m3, m2, [r5 + 12 * 16]
10587 pmulhrsw m3, m7
10588
10589 packuswb m4, m3
10590 movu [r0 + 960], m4
10591
10592 ; mode 17 [row 2, 3]
10593
10594 pslldq m2, 2
10595 pinsrb m2, [r1 + 1], 1
10596 pinsrb m2, [r1 + 2], 0
10597
10598 pmaddubsw m4, m2, [r5 + 18 * 16]
10599 pmulhrsw m4, m7
10600
10601 pslldq m2, 2
10602 pinsrb m2, [r1 + 2], 1
10603 pinsrb m2, [r1 + 4], 0
10604
10605 pmaddubsw m3, m2, [r5 + 24 * 16]
10606 pmulhrsw m3, m7
10607
10608 packuswb m4, m3
10609 movu [r0 + 976], m4
10610
10611 ; mode 17 [row 4, 5]
10612
10613 pslldq m2, 2
10614 pinsrb m2, [r1 + 4], 1
10615 pinsrb m2, [r1 + 5], 0
10616
10617 pmaddubsw m4, m2, [r5 + 30 * 16]
10618 pmulhrsw m4, m7
10619
10620 pmaddubsw m3, m2, [r5 + 4 * 16]
10621 pmulhrsw m3, m7
10622
10623 packuswb m4, m3
10624 movu [r0 + 992], m4
10625
10626 ; mode 17 [row 6, 7]
10627
10628 pslldq m2, 2
10629 pinsrb m2, [r1 + 5], 1
10630 pinsrb m2, [r1 + 6], 0
10631
10632 pmaddubsw m4, m2, [r5 + 10 * 16]
10633 pmulhrsw m4, m7
10634
10635 pslldq m2, 2
10636 pinsrb m2, [r1 + 6], 1
10637 pinsrb m2, [r1 + 7], 0
10638
10639 pmaddubsw m3, m2, [r5 + 16 * 16]
10640 pmulhrsw m3, m7
10641
10642 packuswb m4, m3
10643 movu [r0 + 1008], m4
10644
10645 ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
10646
10647 movh m1, [r3]
10648 movh [r0 + 1024], m1
10649
10650 pslldq m2, m1, 1
10651 pinsrb m2, [r4 + 1], 0
10652 movh [r0 + 1032], m2
10653
10654 pslldq m2, 1
10655 pinsrb m2, [r4 + 2], 0
10656 movh [r0 + 1040], m2
10657
10658 pslldq m2, 1
10659 pinsrb m2, [r4 + 3], 0
10660 movh [r0 + 1048], m2
10661
10662 pslldq m2, 1
10663 pinsrb m2, [r4 + 4], 0
10664 movh [r0 + 1056], m2
10665
10666 pslldq m2, 1
10667 pinsrb m2, [r4 + 5], 0
10668 movh [r0 + 1064], m2
10669
10670 pslldq m2, 1
10671 pinsrb m2, [r4 + 6], 0
10672 movh [r0 + 1072], m2
10673
10674 pslldq m2, 1
10675 pinsrb m2, [r4 + 7], 0
10676 movh [r0 + 1080], m2
10677
10678 ; mode 19 [row 0, 1]
10679
10680 movu m0, [r1]
10681 palignr m1, m0, 1
10682 punpcklbw m0, m1
10683
10684 pmaddubsw m1, m0, [r5 + 6 * 16]
10685 pmulhrsw m1, m7
10686
10687 pslldq m2, m0, 2
10688 pinsrb m2, [r2 + 0], 1
10689 pinsrb m2, [r2 + 1], 0
10690
10691 pmaddubsw m3, m2, [r5 + 12 * 16]
10692 pmulhrsw m3, m7
10693
10694 packuswb m1, m3
10695 movu [r0 + 1088], m1
10696
10697 ; mode 19 [row 2, 3]
10698
10699 pslldq m2, 2
10700 pinsrb m2, [r2 + 1], 1
10701 pinsrb m2, [r2 + 2], 0
10702
10703 pmaddubsw m4, m2, [r5 + 18 * 16]
10704 pmulhrsw m4, m7
10705
10706 pslldq m2, 2
10707 pinsrb m2, [r2 + 2], 1
10708 pinsrb m2, [r2 + 4], 0
10709
10710 pmaddubsw m5, m2, [r5 + 24 * 16]
10711 pmulhrsw m5, m7
10712
10713 packuswb m4, m5
10714 movu [r0 + 1104], m4
10715
10716 ; mode 19 [row 4, 5]
10717
10718 pslldq m2, 2
10719 pinsrb m2, [r2 + 4], 1
10720 pinsrb m2, [r2 + 5], 0
10721
10722 pmaddubsw m4, m2, [r5 + 30 * 16]
10723 pmulhrsw m4, m7
10724
10725 pmaddubsw m5, m2, [r5 + 4 * 16]
10726 pmulhrsw m5, m7
10727
10728 packuswb m4, m5
10729 movu [r0 + 1120], m4
10730
10731 ; mode 19 [row 6, 7]
10732
10733 pslldq m2, 2
10734 pinsrb m2, [r2 + 5], 1
10735 pinsrb m2, [r2 + 6], 0
10736
10737 pmaddubsw m4, m2, [r5 + 10 * 16]
10738 pmulhrsw m4, m7
10739
10740 pslldq m2, 2
10741 pinsrb m2, [r2 + 6], 1
10742 pinsrb m2, [r2 + 7], 0
10743
10744 pmaddubsw m2, [r5 + 16 * 16]
10745 pmulhrsw m2, m7
10746
10747 packuswb m4, m2
10748 movu [r0 + 1136], m4
10749
10750 ; mode 20 [row 0, 1]
10751
10752 pmaddubsw m3, m0, [r5 + 11 * 16]
10753 pmulhrsw m3, m7
10754
10755 pslldq m1, m0, 2
10756 pinsrb m1, [r2 + 0], 1
10757 pinsrb m1, [r2 + 2], 0
10758
10759 pmaddubsw m4, m1, [r5 + 22 * 16]
10760 pmulhrsw m4, m7
10761
10762 packuswb m3, m4
10763 movu [r0 + 1152], m3
10764
10765 ; mode 20 [row 2, 3]
10766
10767 pmaddubsw m3, m1, [r5 + 1 * 16]
10768 pmulhrsw m3, m7
10769
10770 pslldq m2, m1, 2
10771 pinsrb m2, [r2 + 2], 1
10772 pinsrb m2, [r2 + 3], 0
10773
10774 pmaddubsw m4, m2, [r5 + 12 * 16]
10775 pmulhrsw m4, m7
10776
10777 packuswb m3, m4
10778 movu [r0 + 1168], m3
10779
10780 ; mode 20 [row 4, 5]
10781
10782 pslldq m2, 2
10783 pinsrb m2, [r2 + 3], 1
10784 pinsrb m2, [r2 + 5], 0
10785
10786 pmaddubsw m3, m2, [r5 + 23 * 16]
10787 pmulhrsw m3, m7
10788
10789 pmaddubsw m4, m2, [r5 + 2 * 16]
10790 pmulhrsw m4, m7
10791
10792 packuswb m3, m4
10793 movu [r0 + 1184], m3
10794
10795 ; mode 20 [row 6, 7]
10796
10797 pslldq m2, 2
10798 pinsrb m2, [r2 + 5], 1
10799 pinsrb m2, [r2 + 6], 0
10800
10801 pmaddubsw m3, m2, [r5 + 13 * 16]
10802 pmulhrsw m3, m7
10803
10804 pslldq m2, 2
10805 pinsrb m2, [r2 + 6], 1
10806 pinsrb m2, [r2 + 8], 0
10807
10808 pmaddubsw m4, m2, [r5 + 24 * 16]
10809 pmulhrsw m4, m7
10810
10811 packuswb m3, m4
10812 movu [r0 + 1200], m3
10813
10814 ; mode 21 [row 0, 1]
10815
10816 pmaddubsw m2, m0, [r5 + 15 * 16]
10817 pmulhrsw m2, m7
10818
10819 pmaddubsw m3, m1, [r5 + 30 * 16]
10820 pmulhrsw m3, m7
10821
10822 packuswb m2, m3
10823 movu [r0 + 1216], m2
10824
10825 ; mode 21 [row 2, 3]
10826
10827 pmaddubsw m2, m1, [r5 + 13 * 16]
10828 pmulhrsw m2, m7
10829
10830 pslldq m3, m1, 2
10831 pinsrb m3, [r2 + 2], 1
10832 pinsrb m3, [r2 + 4], 0
10833
10834 pmaddubsw m4, m3, [r5 + 28 * 16]
10835 pmulhrsw m4, m7
10836
10837 packuswb m2, m4
10838 movu [r0 + 1232], m2
10839
10840 ; mode 21 [row 4, 5]
10841
10842 pmaddubsw m2, m3, [r5 + 11 * 16]
10843 pmulhrsw m2, m7
10844
10845 pslldq m3, 2
10846 pinsrb m3, [r2 + 4], 1
10847 pinsrb m3, [r2 + 6], 0
10848
10849 pmaddubsw m4, m3, [r5 + 26 * 16]
10850 pmulhrsw m4, m7
10851
10852 packuswb m2, m4
10853 movu [r0 + 1248], m2
10854
10855 ; mode 21 [row 6, 7]
10856
10857 pmaddubsw m2, m3, [r5 + 9 * 16]
10858 pmulhrsw m2, m7
10859
10860 pslldq m3, 2
10861 pinsrb m3, [r2 + 6], 1
10862 pinsrb m3, [r2 + 8], 0
10863
10864 pmaddubsw m4, m3, [r5 + 24 * 16]
10865 pmulhrsw m4, m7
10866
10867 packuswb m2, m4
10868 movu [r0 + 1264], m2
10869
10870 ; mode 22 [row 0, 1]
10871
10872 pmaddubsw m2, m0, [r5 + 19 * 16]
10873 pmulhrsw m2, m7
10874
10875 pmaddubsw m4, m0, [r5 + 6 * 16]
10876 pmulhrsw m4, m7
10877
10878 packuswb m2, m4
10879 movu [r0 + 1280], m2
10880
10881 ; mode 22 [row 2, 3]
10882
10883 pmaddubsw m2, m1, [r5 + 25 * 16]
10884 pmulhrsw m2, m7
10885
10886 pmaddubsw m3, m1, [r5 + 12 * 16]
10887 pmulhrsw m3, m7
10888
10889 packuswb m2, m3
10890 movu [r0 + 1296], m2
10891
10892 ; mode 22 [row 4, 5]
10893
10894 pslldq m1, 2
10895 pinsrb m1, [r2 + 5], 0
10896 pinsrb m1, [r2 + 2], 1
10897
10898 pmaddubsw m2, m1, [r5 + 31 * 16]
10899 pmulhrsw m2, m7
10900
10901 pmaddubsw m3, m1, [r5 + 18 * 16]
10902 pmulhrsw m3, m7
10903
10904 packuswb m2, m3
10905 movu [r0 + 1312], m2
10906
10907 ; mode 22 [row 6, 7]
10908
10909 pmaddubsw m2, m1, [r5 + 5 * 16]
10910 pmulhrsw m2, m7
10911
10912 pslldq m1, 2
10913 pinsrb m1, [r2 + 5], 1
10914 pinsrb m1, [r2 + 7], 0
10915
10916 pmaddubsw m1, [r5 + 24 * 16]
10917 pmulhrsw m1, m7
10918
10919 packuswb m2, m1
10920 movu [r0 + 1328], m2
10921
10922 ; mode 23 [row 0, 1]
10923
10924 pmaddubsw m2, m0, [r5 + 23 * 16]
10925 pmulhrsw m2, m7
10926
10927 pmaddubsw m3, m0, [r5 + 14 * 16]
10928 pmulhrsw m3, m7
10929
10930 packuswb m2, m3
10931 movu [r0 + 1344], m2
10932
10933 ; mode 23 [row 2, 3]
10934
10935 pmaddubsw m2, m0, [r5 + 5 * 16]
10936 pmulhrsw m2, m7
10937
10938 pslldq m1, m0, 2
10939 pinsrb m1, [r2 + 0], 1
10940 pinsrb m1, [r2 + 4], 0
10941
10942 pmaddubsw m3, m1, [r5 + 28 * 16]
10943 pmulhrsw m3, m7
10944
10945 packuswb m2, m3
10946 movu [r0 + 1360], m2
10947
10948 ; mode 23 [row 4, 5]
10949
10950 pmaddubsw m2, m1, [r5 + 19 * 16]
10951 pmulhrsw m2, m7
10952
10953 pmaddubsw m3, m1, [r5 + 10 * 16]
10954 pmulhrsw m3, m7
10955
10956 packuswb m2, m3
10957 movu [r0 + 1376], m2
10958
10959 ; mode 23 [row 6, 7]
10960
10961 pmaddubsw m2, m1, [r5 + 1 * 16]
10962 pmulhrsw m2, m7
10963
10964 pslldq m3, m1, 2
10965 pinsrb m3, [r2 + 4], 1
10966 pinsrb m3, [r2 + 7], 0
10967
10968 pmaddubsw m3, [r5 + 24 * 16]
10969 pmulhrsw m3, m7
10970
10971 packuswb m2, m3
10972 movu [r0 + 1392], m2
10973
10974 ; mode 24 [row 0, 1]
10975
10976 pmaddubsw m2, m0, [r5 + 27 * 16]
10977 pmulhrsw m2, m7
10978
10979 pmaddubsw m5, m0, [r5 + 22 * 16]
10980 pmulhrsw m5, m7
10981
10982 packuswb m2, m5
10983 movu [r0 + 1408], m2
10984
10985 ; mode 24 [row 2, 3]
10986
10987 pmaddubsw m2, m0, [r5 + 17 * 16]
10988 pmulhrsw m2, m7
10989
10990 pmaddubsw m3, m0, [r5 + 12 * 16]
10991 pmulhrsw m3, m7
10992
10993 packuswb m2, m3
10994 movu [r0 + 1424], m2
10995
10996 ; mode 24 [row 4, 5]
10997
10998 pmaddubsw m2, m0, [r5 + 7 * 16]
10999 pmulhrsw m2, m7
11000
11001 pmaddubsw m3, m0, [r5 + 2 * 16]
11002 pmulhrsw m3, m7
11003
11004 packuswb m2, m3
11005 movu [r0 + 1440], m2
11006
11007 ; mode 24 [row 6, 7]
11008
11009 pinsrb m1, [r2 + 6], 0
11010
11011 pmaddubsw m2, m1, [r5 + 29 * 16]
11012 pmulhrsw m2, m7
11013
11014 pmaddubsw m1, [r5 + 24 * 16]
11015 pmulhrsw m1, m7
11016
11017 packuswb m2, m1
11018 movu [r0 + 1456], m2
11019
11020 ; mode 25 [row 0, 1]
11021
11022 pmaddubsw m2, m0, [r5 + 30 * 16]
11023 pmulhrsw m2, m7
11024
11025 pmaddubsw m1, m0, [r5 + 28 * 16]
11026 pmulhrsw m1, m7
11027
11028 packuswb m2, m1
11029 movu [r0 + 1472], m2
11030
11031 ; mode 25 [row 2, 3]
11032
11033 pmaddubsw m2, m0, [r5 + 26 * 16]
11034 pmulhrsw m2, m7
11035
11036 pmaddubsw m1, m0, [r5 + 24 * 16]
11037 pmulhrsw m1, m7
11038
11039 packuswb m2, m1
11040 movu [r0 + 1488], m2
11041
11042 ; mode 25 [row 4, 5]
11043
11044 pmaddubsw m1, m0, [r5 + 20 * 16]
11045 pmulhrsw m1, m7
11046
11047 packuswb m5, m1
11048 movu [r0 + 1504], m5
11049
11050 ; mode 25 [row 6, 7]
11051
11052 pmaddubsw m2, m0, [r5 + 18 * 16]
11053 pmulhrsw m2, m7
11054
11055 pmaddubsw m1, m0, [r5 + 16 * 16]
11056 pmulhrsw m1, m7
11057
11058 packuswb m2, m1
11059 movu [r0 + 1520], m2
11060
11061 ; mode 26
11062
11063 movu m0, [r1 + 1]
11064
11065 pshufb m1, m0, [tab_Si]
11066 movu [r0 + 1536], m1
11067 movu [r0 + 1552], m1
11068 movu [r0 + 1568], m1
11069 movu [r0 + 1584], m1
11070
11071 pxor m5, m5
11072
11073 pshufb m1, m1, m5
11074 punpcklbw m1, m5
11075
11076 movu m2, [r2]
11077
11078 pshufb m3, m2, m5
11079 punpcklbw m3, m5
11080
11081 psrldq m4, m2, 1
11082 punpcklbw m4, m5
11083
11084 movu m2, [r2 + 9]
11085 punpcklbw m2, m5
11086
11087 psubw m4, m3
11088 psubw m2, m3
11089
11090 psraw m4, 1
11091 psraw m2, 1
11092
11093 paddw m4, m1
11094 paddw m2, m1
11095
11096 packuswb m4, m2
11097
11098 pextrb [r0 + 1536], m4, 0
11099 pextrb [r0 + 1544], m4, 1
11100 pextrb [r0 + 1552], m4, 2
11101 pextrb [r0 + 1560], m4, 3
11102 pextrb [r0 + 1568], m4, 4
11103 pextrb [r0 + 1576], m4, 5
11104 pextrb [r0 + 1584], m4, 6
11105 pextrb [r0 + 1592], m4, 7
11106
11107 ; mode 27 [row 0, 1]
11108
11109 palignr m6, m0, 1
11110 punpcklbw m4, m0, m6
11111
11112 pmaddubsw m1, m4, [r5 + 2 * 16]
11113 pmulhrsw m1, m7
11114
11115 pmaddubsw m2, m4, [r5 + 4 * 16]
11116 pmulhrsw m2, m7
11117
11118 packuswb m1, m2
11119 movu [r0 + 1600], m1
11120
11121 ; mode 27 [row 2, 3]
11122
11123 pmaddubsw m1, m4, [r5 + 6 * 16]
11124 pmulhrsw m1, m7
11125
11126 pmaddubsw m2, m4, [r5 + 8 * 16]
11127 pmulhrsw m2, m7
11128
11129 packuswb m1, m2
11130 movu [r0 + 1616], m1
11131
11132 ; mode 27 [row 4, 5]
11133
11134 pmaddubsw m3, m4, [r5 + 10 * 16]
11135 pmulhrsw m3, m7
11136
11137 pmaddubsw m2, m4, [r5 + 12 * 16]
11138 pmulhrsw m2, m7
11139
11140 packuswb m1, m3, m2
11141 movu [r0 + 1632], m1
11142
11143 ; mode 27 [row 6, 7]
11144
11145 pmaddubsw m1, m4, [r5 + 14 * 16]
11146 pmulhrsw m1, m7
11147
11148 pmaddubsw m2, m4, [r5 + 16 * 16]
11149 pmulhrsw m2, m7
11150
11151 packuswb m1, m2
11152 movu [r0 + 1648], m1
11153
11154 ; mode 28 [row 0, 1]
11155
11156 pmaddubsw m1, m4, [r5 + 5 * 16]
11157 pmulhrsw m1, m7
11158
11159 packuswb m1, m3
11160 movu [r0 + 1664], m1
11161
11162 ; mode 28 [row 2, 3]
11163
11164 pmaddubsw m1, m4, [r5 + 15 * 16]
11165 pmulhrsw m1, m7
11166
11167 pmaddubsw m2, m4, [r5 + 20 * 16]
11168 pmulhrsw m2, m7
11169
11170 packuswb m1, m2
11171 movu [r0 + 1680], m1
11172
11173 ; mode 28 [row 4, 5]
11174
11175 pmaddubsw m1, m4, [r5 + 25 * 16]
11176 pmulhrsw m1, m7
11177
11178 pmaddubsw m2, m4, [r5 + 30 * 16]
11179 pmulhrsw m2, m7
11180
11181 packuswb m1, m2
11182 movu [r0 + 1696], m1
11183
11184 ; mode 28 [row 6, 7]
11185
11186 palignr m1, m0, 2
11187 punpcklbw m5, m6, m1
11188
11189 pmaddubsw m2, m5, [r5 + 3 * 16]
11190 pmulhrsw m2, m7
11191
11192 pmaddubsw m3, m5, [r5 + 8 * 16]
11193 pmulhrsw m3, m7
11194
11195 packuswb m2, m3
11196 movu [r0 + 1712], m2
11197
11198 ; mode 29 [row 0, 1]
11199
11200 pmaddubsw m2, m4, [r5 + 9 * 16]
11201 pmulhrsw m2, m7
11202
11203 pmaddubsw m3, m4, [r5 + 18 * 16]
11204 pmulhrsw m3, m7
11205
11206 packuswb m2, m3
11207 movu [r0 + 1728], m2
11208
11209 ; mode 29 [row 2, 3]
11210
11211 pmaddubsw m2, m4, [r5 + 27 * 16]
11212 pmulhrsw m2, m7
11213
11214 pmaddubsw m3, m5, [r5 + 4 * 16]
11215 pmulhrsw m3, m7
11216
11217 packuswb m2, m3
11218 movu [r0 + 1744], m2
11219
11220 ; mode 29 [row 4, 5]
11221
11222 pmaddubsw m2, m5, [r5 + 13 * 16]
11223 pmulhrsw m2, m7
11224
11225 pmaddubsw m3, m5, [r5 + 22 * 16]
11226 pmulhrsw m3, m7
11227
11228 packuswb m2, m3
11229 movu [r0 + 1760], m2
11230
11231 ; mode 29 [row 6, 7]
11232
11233 pmaddubsw m2, m5, [r5 + 31 * 16]
11234 pmulhrsw m2, m7
11235
11236 palignr m6, m0, 3
11237 punpcklbw m1, m6
11238
11239 pmaddubsw m3, m1, [r5 + 8 * 16]
11240 pmulhrsw m3, m7
11241
11242 packuswb m2, m3
11243 movu [r0 + 1776], m2
11244
11245 ; mode 32 [row 2]
11246
11247 movh [r0 + 1936], m2
11248
11249 ; mode 30 [row 0, 1]
11250
11251 pmaddubsw m2, m4, [r5 + 13 * 16]
11252 pmulhrsw m2, m7
11253
11254 pmaddubsw m3, m4, [r5 + 26 * 16]
11255 pmulhrsw m3, m7
11256
11257 packuswb m2, m3
11258 movu [r0 + 1792], m2
11259
11260 ; mode 30 [row 2, 3]
11261
11262 pmaddubsw m2, m5, [r5 + 7 * 16]
11263 pmulhrsw m2, m7
11264
11265 pmaddubsw m3, m5, [r5 + 20 * 16]
11266 pmulhrsw m3, m7
11267
11268 packuswb m2, m3
11269 movu [r0 + 1808], m2
11270
11271 ; mode 33 [row 1]
11272
11273 movhps [r0 + 1992], m2
11274
11275 ; mode 30 [row 4, 5]
11276
11277 pmaddubsw m2, m1, [r5 + 1 * 16]
11278 pmulhrsw m2, m7
11279
11280 pmaddubsw m3, m1, [r5 + 14 * 16]
11281 pmulhrsw m3, m7
11282
11283 packuswb m2, m3
11284 movu [r0 + 1824], m2
11285
11286 ; mode 33 [row 2]
11287
11288 movhps [r0 + 2000], m2
11289
11290 ; mode 30 [row 6, 7]
11291
11292 pmaddubsw m2, m1, [r5 + 27 * 16]
11293 pmulhrsw m2, m7
11294
11295 psrldq m0, 4
11296 punpcklbw m6, m0
11297
11298 pmaddubsw m3, m6, [r5 + 8 * 16]
11299 pmulhrsw m3, m7
11300
11301 packuswb m2, m3
11302 movu [r0 + 1840], m2
11303
11304 ; mode 33 [row 3]
11305
11306 movhps [r0 + 2008], m2
11307
11308 ; mode 31 [row 0, 1]
11309
11310 pmaddubsw m2, m4, [r5 + 17 * 16]
11311 pmulhrsw m2, m7
11312
11313 pmaddubsw m3, m5, [r5 + 2 * 16]
11314 pmulhrsw m3, m7
11315
11316 packuswb m2, m3
11317 movu [r0 + 1856], m2
11318
11319 ; mode 31 [row 2, 3]
11320
11321 pmaddubsw m2, m5, [r5 + 19 * 16]
11322 pmulhrsw m2, m7
11323
11324 pmaddubsw m3, m1, [r5 + 4 * 16]
11325 pmulhrsw m3, m7
11326
11327 packuswb m2, m3
11328 movu [r0 + 1872], m2
11329
11330 ; mode 31 [row 4, 5]
11331
11332 pmaddubsw m2, m1, [r5 + 21 * 16]
11333 pmulhrsw m2, m7
11334
11335 pmaddubsw m3, m6, [r5 + 6 * 16]
11336 pmulhrsw m3, m7
11337
11338 packuswb m2, m3
11339 movu [r0 + 1888], m2
11340
11341 ; mode 31 [row 6, 7]
11342
11343 pmaddubsw m2, m6, [r5 + 23 * 16]
11344 pmulhrsw m2, m7
11345
11346 movu m3, [r1 + 6]
11347 punpcklbw m0, m3
11348
11349 pmaddubsw m3, m0, [r5 + 8 * 16]
11350 pmulhrsw m3, m7
11351
11352 packuswb m2, m3
11353 movu [r0 + 1904], m2
11354
11355 ; mode 32 [row 0, 1]
11356
11357 pmaddubsw m2, m4, [r5 + 21 * 16]
11358 pmulhrsw m2, m7
11359
11360 pmaddubsw m3, m5, [r5 + 10 * 16]
11361 pmulhrsw m3, m7
11362
11363 packuswb m2, m3
11364 movu [r0 + 1920], m2
11365
11366 ; mode 32 [row 3]
11367
11368 pmaddubsw m2, m1, [r5 + 20 * 16]
11369 pmulhrsw m2, m7
11370
11371 pxor m3, m3
11372
11373 packuswb m2, m3
11374 movh [r0 + 1944], m2
11375
11376 ; mode 32 [row 4, 5]
11377
11378 pmaddubsw m2, m6, [r5 + 9 * 16]
11379 pmulhrsw m2, m7
11380
11381 pmaddubsw m3, m6, [r5 + 30 * 16]
11382 pmulhrsw m3, m7
11383
11384 packuswb m2, m3
11385 movu [r0 + 1952], m2
11386
11387 ; mode 33 [row 4, 5]
11388
11389 pmaddubsw m2, m0, [r5 + 2 * 16]
11390 pmulhrsw m2, m7
11391
11392 pmaddubsw m3, m0, [r5 + 28 * 16]
11393 pmulhrsw m3, m7
11394
11395 packuswb m2, m3
11396 movu [r0 + 2016], m2
11397
11398 ; mode 32 [row 6]
11399
11400 pmaddubsw m2, m0, [r5 + 19 * 16]
11401 pmulhrsw m2, m7
11402
11403 ; mode 32 [row 7]
11404
11405 movu m0, [r1 + 6]
11406 palignr m3, m0, 1
11407 punpcklbw m0, m3
11408
11409 pmaddubsw m3, m0, [r5 + 8 * 16]
11410 pmulhrsw m3, m7
11411
11412 packuswb m2, m3
11413 movu [r0 + 1968], m2
11414
11415 ; mode 33 [row 6, 7]
11416
11417 pmaddubsw m2, m0, [r5 + 22 * 16]
11418 pmulhrsw m2, m7
11419
11420 movu m0, [r1 + 7]
11421 palignr m3, m0, 1
11422 punpcklbw m0, m3
11423
11424 pmaddubsw m3, m0, [r5 + 16 * 16]
11425 pmulhrsw m3, m7
11426
11427 packuswb m2, m3
11428 movu [r0 + 2032], m2
11429
11430 ; mode 33 [row 0]
11431
11432 pmaddubsw m2, m4, [r5 + 26 * 16]
11433 pmulhrsw m2, m7
11434
11435 pxor m3, m3
11436
11437 packuswb m2, m3
11438 movh [r0 + 1984], m2
11439
11440 ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
11441
11442 movu m0, [r3 + 2]
11443 palignr m1, m0, 1
11444 punpcklqdq m2, m0, m1
11445 movu [r0 + 2048], m2
11446
11447 palignr m1, m0, 2
11448 palignr m2, m0, 3
11449 punpcklqdq m1, m2
11450 movu [r0 + 2064], m1
11451
11452 palignr m1, m0, 4
11453 palignr m2, m0, 5
11454 punpcklqdq m1, m2
11455 movu [r0 + 2080], m1
11456
11457 palignr m1, m0, 6
11458 palignr m2, m0, 7
11459 punpcklqdq m1, m2
11460 movu [r0 + 2096], m1
11461
11462 RET
11463
11464 ;-----------------------------------------------------------------------------
11465 ; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
11466 ;-----------------------------------------------------------------------------
11467 INIT_XMM sse4
11468 cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
11469
11470 movu m0, [r4 + 2]
11471 movu [r0 + 0 * 16], m0
11472
11473 movu m1, m0
11474
11475 movu m6, [r4 + 18]
11476 palignr m5, m6, m0, 1
11477 movu [r0 + 1 * 16], m5
11478
11479 movu m4, m5
11480
11481 palignr m5, m6, m0, 2
11482 movu [r0 + 2 * 16], m5
11483 palignr m5, m6, m0, 3
11484 movu [r0 + 3 * 16], m5
11485 palignr m5, m6, m0, 4
11486 movu [r0 + 4 * 16], m5
11487 palignr m5, m6, m0, 5
11488 movu [r0 + 5 * 16], m5
11489 palignr m5, m6, m0, 6
11490 movu [r0 + 6 * 16], m5
11491 palignr m5, m6, m0, 7
11492 movu [r0 + 7 * 16], m5
11493
11494 movu m7, m5
11495
11496 palignr m5, m6, m0, 8
11497 movu [r0 + 8 * 16], m5
11498
11499 movu m2, m5
11500
11501 palignr m5, m6, m0, 9
11502 movu [r0 + 9 * 16], m5
11503
11504 palignr m3, m6, m0, 10
11505 movu [r0 + 10 * 16], m3
11506 palignr m3, m6, m0, 11
11507 movu [r0 + 11 * 16], m3
11508 palignr m3, m6, m0, 12
11509 movu [r0 + 12 * 16], m3
11510
11511 ; mode 3 [row 15]
11512 movu [r0 + (3-2)*16*16 + 15 * 16], m3
11513
11514 palignr m3, m6, m0, 13
11515 movu [r0 + 13 * 16], m3
11516 palignr m3, m6, m0, 14
11517 movu [r0 + 14 * 16], m3
11518 palignr m3, m6, m0, 15
11519 movu [r0 + 15 * 16], m3
11520
11521 ; mode 3 [row 0]
11522 lea r5, [ang_table]
11523 movu m3, [pw_1024]
11524 movu m0, [r4 + 1]
11525 punpcklbw m0, m1
11526
11527 ; mode 17 [row 8 - second half]
11528 pmaddubsw m1, m0, [r5 + 22 * 16]
11529 pmulhrsw m1, m3
11530 packuswb m1, m1
11531 movh [r0 + 248 * 16 + 8], m1
11532 ; mode 17 [row 8 - second half] end
11533
11534 pmaddubsw m1, m0, [r5 + 26 * 16]
11535 pmulhrsw m1, m3
11536 punpcklbw m7, m2
11537 pmaddubsw m2, m7, [r5 + 26 * 16]
11538 pmulhrsw m2, m3
11539 packuswb m1, m2
11540 movu [r0 + 16 * 16], m1
11541
11542 ;mode 6 [row 1]
11543 movu [r0 + 65 * 16], m1
11544
11545 ; mode 4 [row 0]
11546 pmaddubsw m1, m0, [r5 + 21 * 16]
11547 pmulhrsw m1, m3
11548 pmaddubsw m2, m7, [r5 + 21 * 16]
11549 pmulhrsw m2, m3
11550 packuswb m1, m2
11551 movu [r0 + 32 * 16], m1
11552
11553 ; mode 5 [row 0]
11554 pmaddubsw m1, m0, [r5 + 17 * 16]
11555 pmulhrsw m1, m3
11556 pmaddubsw m2, m7, [r5 + 17 * 16]
11557 pmulhrsw m2, m3
11558 packuswb m1, m2
11559 movu [r0 + 48 * 16], m1
11560
11561 ; mode 6 [row 0]
11562 pmaddubsw m1, m0, [r5 + 13 * 16]
11563 pmulhrsw m1, m3
11564 pmaddubsw m2, m7, [r5 + 13 * 16]
11565 pmulhrsw m2, m3
11566 packuswb m1, m2
11567 movu [r0 + 64 * 16], m1
11568
11569 ; mode 7 [row 0]
11570 pmaddubsw m1, m0, [r5 + 9 * 16]
11571 pmulhrsw m1, m3
11572 pmaddubsw m2, m7, [r5 + 9 * 16]
11573 pmulhrsw m2, m3
11574 packuswb m1, m2
11575 movu [r0 + 80 * 16], m1
11576
11577 ; mode 7 [row 1]
11578 pmaddubsw m1, m0, [r5 + 18 * 16]
11579 pmulhrsw m1, m3
11580 pmaddubsw m2, m7, [r5 + 18 * 16]
11581 pmulhrsw m2, m3
11582 packuswb m1, m2
11583 movu [r0 + 81 * 16], m1
11584
11585 ; mode 7 [row 2]
11586 pmaddubsw m1, m0, [r5 + 27 * 16]
11587 pmulhrsw m1, m3
11588 pmaddubsw m2, m7, [r5 + 27 * 16]
11589 pmulhrsw m2, m3
11590 packuswb m1, m2
11591 movu [r0 + 82 * 16], m1
11592
11593 ; mode 8 [row 0]
11594 pmaddubsw m1, m0, [r5 + 5 * 16]
11595 pmulhrsw m1, m3
11596 pmaddubsw m2, m7, [r5 + 5 * 16]
11597 pmulhrsw m2, m3
11598 packuswb m1, m2
11599 movu [r0 + 96 * 16], m1
11600
11601 ; mode 8 [row 1]
11602 pmaddubsw m1, m0, [r5 + 10 * 16]
11603 pmulhrsw m1, m3
11604 pmaddubsw m2, m7, [r5 + 10 * 16]
11605 pmulhrsw m2, m3
11606 packuswb m1, m2
11607 movu [r0 + 97 * 16], m1
11608
11609 ; mode 8 [row 2]
11610 pmaddubsw m1, m0, [r5 + 15 * 16]
11611 pmulhrsw m1, m3
11612 pmaddubsw m2, m7, [r5 + 15 * 16]
11613 pmulhrsw m2, m3
11614 packuswb m1, m2
11615 movu [r0 + 98 * 16], m1
11616
11617 ; mode 8 [row 3]
11618 pmaddubsw m1, m0, [r5 + 20 * 16]
11619 pmulhrsw m1, m3
11620 pmaddubsw m2, m7, [r5 + 20 * 16]
11621 pmulhrsw m2, m3
11622 packuswb m1, m2
11623 movu [r0 + 99 * 16], m1
11624
11625 ; mode 8 [row 4]
11626 pmaddubsw m1, m0, [r5 + 25 * 16]
11627 pmulhrsw m1, m3
11628 pmaddubsw m2, m7, [r5 + 25 * 16]
11629 pmulhrsw m2, m3
11630 packuswb m1, m2
11631 movu [r0 + 100 * 16], m1
11632
11633 ; mode 8 [row 5]
11634 pmaddubsw m1, m0, [r5 + 30 * 16]
11635 pmulhrsw m1, m3
11636 pmaddubsw m2, m7, [r5 + 30 * 16]
11637 pmulhrsw m2, m3
11638 packuswb m1, m2
11639 movu [r0 + 101 * 16], m1
11640
11641 ; mode 15 [row 13 - second half]
11642 pmaddubsw m1, m0, [r5 + 18 * 16]
11643 pmulhrsw m1, m3
11644 packuswb m1, m1
11645 movh [r0 + 221 * 16 + 8], m1
11646 ; mode 15 [row 13 - second half] end
11647
11648 ; mode 15 [row 14 - second half]
11649 pmaddubsw m1, m0, [r5 + 1 * 16]
11650 pmulhrsw m1, m3
11651 packuswb m1, m1
11652 movh [r0 + 222 * 16 + 8], m1
11653 ; mode 15 [row 14 - second half] end
11654
11655 ; mode 16 [row 10 - second half]
11656 pmaddubsw m1, m0, [r5 + 25 * 16]
11657 pmulhrsw m1, m3
11658 packuswb m1, m1
11659 movh [r0 + 234 * 16 + 8], m1
11660 ; mode 16 [row 10 - second half] end
11661
11662 ; mode 16 [row 11 - second half]
11663 pmaddubsw m1, m0, [r5 + 4 * 16]
11664 pmulhrsw m1, m3
11665 packuswb m1, m1
11666 movh [r0 + 235 * 16 + 8], m1
11667 ; mode 16 [row 11 - second half] end
11668
11669 ; mode 3 [row 1]
11670 movu m6, [r5 + 20 * 16]
11671 movu m0, [r4 + 2]
11672 punpcklbw m0, m4
11673
11674 ; mode 17 [row 7 - second half]
11675 pmaddubsw m1, m0, [r5 + 16 * 16]
11676 pmulhrsw m1, m3
11677 packuswb m1, m1
11678 movh [r0 + 247 * 16 + 8], m1
11679
11680 ; mode 17 [row 7 - second half] end
11681 pmaddubsw m1, m0, m6
11682 pmulhrsw m1, m3
11683 movu m2, [r4 + 10]
11684 punpcklbw m2, m5
11685 pmaddubsw m4, m2, m6
11686 pmulhrsw m4, m3
11687 packuswb m1, m4
11688 movu [r0 + 17 * 16], m1
11689
11690 ;mode 6 [row 3]
11691 movu [r0 + 67 * 16], m1
11692
11693 ; mode 4 row [row 1]
11694 pmaddubsw m1, m0, [r5 + 10 * 16]
11695 pmulhrsw m1, m3
11696 pmaddubsw m4, m2, [r5 + 10 * 16]
11697 pmulhrsw m4, m3
11698 packuswb m1, m4
11699 movu [r0 + 33 * 16], m1
11700
11701 ; mode 4 row [row 2]
11702 pmaddubsw m1, m0, [r5 + 31 * 16]
11703 pmulhrsw m1, m3
11704 pmaddubsw m4, m2, [r5 + 31 * 16]
11705 pmulhrsw m4, m3
11706 packuswb m1, m4
11707 movu [r0 + 34 * 16], m1
11708
11709 ; mode 7 [row 6]
11710 movu [r0 + 86 * 16], m1
11711
11712 ; mode 5 row [row 1]
11713 pmaddubsw m1, m0, [r5 + 2 * 16]
11714 pmulhrsw m1, m3
11715 pmaddubsw m4, m2, [r5 + 2 * 16]
11716 pmulhrsw m4, m3
11717 packuswb m1, m4
11718 movu [r0 + 49 * 16], m1
11719
11720 ; mode 5 row [row 2]
11721 pmaddubsw m1, m0, [r5 + 19 * 16]
11722 pmulhrsw m1, m3
11723 pmaddubsw m4, m2, [r5 + 19 * 16]
11724 pmulhrsw m4, m3
11725 packuswb m1, m4
11726 movu [r0 + 50 * 16], m1
11727
11728 ; mode 6 [row 2]
11729 pmaddubsw m1, m0, [r5 + 7 * 16]
11730 pmulhrsw m1, m3
11731 pmaddubsw m4, m2, [r5 + 7 * 16]
11732 pmulhrsw m4, m3
11733 packuswb m1, m4
11734 movu [r0 + 66 * 16], m1
11735
11736 ; mode 7 [row 3]
11737 pmaddubsw m1, m0, [r5 + 4 * 16]
11738 pmulhrsw m1, m3
11739 pmaddubsw m4, m2, [r5 + 4 * 16]
11740 pmulhrsw m4, m3
11741 packuswb m1, m4
11742 movu [r0 + 83 * 16], m1
11743
11744 ; mode 7 [row 4]
11745 pmaddubsw m1, m0, [r5 + 13 * 16]
11746 pmulhrsw m1, m3
11747 pmaddubsw m4, m2, [r5 + 13 * 16]
11748 pmulhrsw m4, m3
11749 packuswb m1, m4
11750 movu [r0 + 84 * 16], m1
11751
11752 ; mode 8 [row 8]
11753 movu [r0 + 104 * 16], m1
11754
11755 ; mode 7 [row 5]
11756 pmaddubsw m1, m0, [r5 + 22 * 16]
11757 pmulhrsw m1, m3
11758 pmaddubsw m4, m2, [r5 + 22 * 16]
11759 pmulhrsw m4, m3
11760 packuswb m1, m4
11761 movu [r0 + 85 * 16], m1
11762
11763 ; mode 8 [row 6]
11764 pmaddubsw m1, m0, [r5 + 3 * 16]
11765 pmulhrsw m1, m3
11766 pmaddubsw m4, m2, [r5 + 3 * 16]
11767 pmulhrsw m4, m3
11768 packuswb m1, m4
11769 movu [r0 + 102 * 16], m1
11770
11771 ; mode 8 [row 7]
11772 pmaddubsw m1, m0, [r5 + 8 * 16]
11773 pmulhrsw m1, m3
11774 pmaddubsw m4, m2, [r5 + 8 * 16]
11775 pmulhrsw m4, m3
11776 packuswb m1, m4
11777 movu [r0 + 103 * 16], m1
11778
11779 ; mode 8 [row 9]
11780 pmaddubsw m1, m0, [r5 + 18 * 16]
11781 pmulhrsw m1, m3
11782 pmaddubsw m4, m2, [r5 + 18 * 16]
11783 pmulhrsw m4, m3
11784 packuswb m1, m4
11785 movu [r0 + 105 * 16], m1
11786
11787 ; mode 8 [row 10]
11788 pmaddubsw m1, m0, [r5 + 23 * 16]
11789 pmulhrsw m1, m3
11790 pmaddubsw m4, m2, [r5 + 23 * 16]
11791 pmulhrsw m4, m3
11792 packuswb m1, m4
11793 movu [r0 + 106 * 16], m1
11794
11795 ; mode 8 [row 11]
11796 pmaddubsw m1, m0, [r5 + 28 * 16]
11797 pmulhrsw m1, m3
11798 pmaddubsw m4, m2, [r5 + 28 * 16]
11799 pmulhrsw m4, m3
11800 packuswb m1, m4
11801 movu [r0 + 107 * 16], m1
11802
11803 ; mode 3 [row 2]
11804 movu m0, [r4 + 3]
11805 movd m1, [r4 + 19]
11806 palignr m1, m0, 1
11807 punpcklbw m0, m1
11808
11809 ; mode 17 [row 6 - second half]
11810 pmaddubsw m1, m0, [r5 + 10 * 16]
11811 pmulhrsw m1, m3
11812 packuswb m1, m1
11813 movh [r0 + 246 * 16 + 8], m1
11814 ; mode 17 [row 6 - second half] end
11815
11816 pmaddubsw m1, m0, [r5 + 14 * 16]
11817 pmulhrsw m1, m3
11818 movu m2, [r4 + 11]
11819 movd m4, [r4 + 27]
11820 palignr m4, m2, 1
11821 punpcklbw m2, m4
11822 pmaddubsw m4, m2, [r5 + 14 * 16]
11823 pmulhrsw m4, m3
11824 packuswb m1, m4
11825 movu [r0 + 18 * 16], m1
11826
11827 ; mode 6 [row 5]
11828 movu [r0 + 69 * 16], m1
11829
11830 ; mode 4 row [row 3]
11831 pmaddubsw m1, m0, [r5 + 20 * 16]
11832 pmulhrsw m1, m3
11833 pmaddubsw m4, m2, [r5 + 20 * 16]
11834 pmulhrsw m4, m3
11835 packuswb m1, m4
11836 movu [r0 + 35 * 16], m1
11837
11838 ; mode 5 row [row 3]
11839 pmaddubsw m1, m0, [r5 + 4 * 16]
11840 pmulhrsw m1, m3
11841 pmaddubsw m4, m2, [r5 + 4 * 16]
11842 pmulhrsw m4, m3
11843 packuswb m1, m4
11844 movu [r0 + 51 * 16], m1
11845
11846 ; mode 5 row [row 4]
11847 pmaddubsw m1, m0, [r5 + 21 * 16]
11848 pmulhrsw m1, m3
11849 pmaddubsw m4, m2, [r5 + 21 * 16]
11850 pmulhrsw m4, m3
11851 packuswb m1, m4
11852 movu [r0 + 52 * 16], m1
11853
11854 ; mode 6 [row 4]
11855 pmaddubsw m1, m0, [r5 + 1 * 16]
11856 pmulhrsw m1, m3
11857 pmaddubsw m4, m2, [r5 + 1 * 16]
11858 pmulhrsw m4, m3
11859 packuswb m1, m4
11860 movu [r0 + 68 * 16], m1
11861
11862 ; mode 6 [row 6]
11863 pmaddubsw m1, m0, [r5 + 27 * 16]
11864 pmulhrsw m1, m3
11865 pmaddubsw m4, m2, [r5 + 27 * 16]
11866 pmulhrsw m4, m3
11867 packuswb m1, m4
11868 movu [r0 + 70 * 16], m1
11869
11870 ; mode 7 [row 7]
11871 pmaddubsw m1, m0, [r5 + 8 * 16]
11872 pmulhrsw m1, m3
11873 pmaddubsw m4, m2, [r5 + 8 * 16]
11874 pmulhrsw m4, m3
11875 packuswb m1, m4
11876 movu [r0 + 87 * 16], m1
11877
11878 ; mode 7 [row 8]
11879 pmaddubsw m1, m0, [r5 + 17 * 16]
11880 pmulhrsw m1, m3
11881 pmaddubsw m4, m2, [r5 + 17 * 16]
11882 pmulhrsw m4, m3
11883 packuswb m1, m4
11884 movu [r0 + 88 * 16], m1
11885
11886 ; mode 7 [row 9]
11887 pmaddubsw m1, m0, [r5 + 26 * 16]
11888 pmulhrsw m1, m3
11889 pmaddubsw m4, m2, [r5 + 26 * 16]
11890 pmulhrsw m4, m3
11891 packuswb m1, m4
11892 movu [r0 + 89 * 16], m1
11893
11894 ; mode 8 [row 12]
11895 pmaddubsw m1, m0, [r5 + 1 * 16]
11896 pmulhrsw m1, m3
11897 pmaddubsw m4, m2, [r5 + 1 * 16]
11898 pmulhrsw m4, m3
11899 packuswb m1, m4
11900 movu [r0 + 108 * 16], m1
11901
11902 ; mode 8 [row 13]
11903 pmaddubsw m1, m0, [r5 + 6 * 16]
11904 pmulhrsw m1, m3
11905 pmaddubsw m4, m2, [r5 + 6 * 16]
11906 pmulhrsw m4, m3
11907 packuswb m1, m4
11908 movu [r0 + 109 * 16], m1
11909
11910 ; mode 8 [row 14]
11911 pmaddubsw m1, m0, [r5 + 11 * 16]
11912 pmulhrsw m1, m3
11913 pmaddubsw m4, m2, [r5 + 11 * 16]
11914 pmulhrsw m4, m3
11915 packuswb m1, m4
11916 movu [r0 + 110 * 16], m1
11917
11918 ; mode 8 [row 15]
11919 pmaddubsw m1, m0, [r5 + 16 * 16]
11920 pmulhrsw m1, m3
11921 pmaddubsw m4, m2, [r5 + 16 * 16]
11922 pmulhrsw m4, m3
11923 packuswb m1, m4
11924 movu [r0 + 111 * 16], m1
11925
11926 ; mode 3 [row 3]
11927 movu m0, [r4 + 4]
11928 movd m1, [r4 + 20]
11929 palignr m1, m0, 1
11930 punpcklbw m0, m1
11931
11932 ; mode 17 [row 4 - second half]
11933 pmaddubsw m1, m0, [r5 + 30 * 16]
11934 pmulhrsw m1, m3
11935 packuswb m1, m1
11936 movh [r0 + 244 * 16 + 8], m1
11937 ; mode 17 [row 4 - second half] end
11938
11939 ; mode 17 [row 5 - second half]
11940 pmaddubsw m1, m0, [r5 + 4 * 16]
11941 pmulhrsw m1, m3
11942 packuswb m1, m1
11943 movh [r0 + 245 * 16 + 8], m1
11944 ; mode 17 [row 5 - second half] end
11945
11946 pmaddubsw m1, m0, [r5 + 8 * 16]
11947 pmulhrsw m1, m3
11948 movu m2, [r4 + 12]
11949 movd m4, [r4 + 28]
11950 palignr m4, m2, 1
11951 punpcklbw m2, m4
11952 pmaddubsw m4, m2, [r5 + 8 * 16]
11953 pmulhrsw m4, m3
11954 packuswb m1, m4
11955 movu [r0 + 19 * 16], m1
11956
11957 ; mode 6 [row 7]
11958 movu [r0 + 71 * 16], m1
11959
11960 ; mode 4 row [row 4]
11961 pmaddubsw m1, m0, [r5 + 9 * 16]
11962 pmulhrsw m1, m3
11963 pmaddubsw m4, m2, [r5 + 9 * 16]
11964 pmulhrsw m4, m3
11965 packuswb m1, m4
11966 movu [r0 + 36 * 16], m1
11967
11968 ; mode 4 row [row 5]
11969 pmaddubsw m1, m0, [r5 + 30 * 16]
11970 pmulhrsw m1, m3
11971 pmaddubsw m4, m2, [r5 + 30 * 16]
11972 pmulhrsw m4, m3
11973 packuswb m1, m4
11974 movu [r0 + 37 * 16], m1
11975
11976 ; mode 7 row [row 13]
11977 movu [r0 + 93 * 16], m1
11978
11979 ; mode 5 row [row 5]
11980 pmaddubsw m1, m0, [r5 + 6 * 16]
11981 pmulhrsw m1, m3
11982 pmaddubsw m4, m2, [r5 + 6 * 16]
11983 pmulhrsw m4, m3
11984 packuswb m1, m4
11985 movu [r0 + 53 * 16], m1
11986
11987 ; mode 5 row [row 6]
11988 pmaddubsw m1, m0, [r5 + 23 * 16]
11989 pmulhrsw m1, m3
11990 pmaddubsw m4, m2, [r5 + 23 * 16]
11991 pmulhrsw m4, m3
11992 packuswb m1, m4
11993 movu [r0 + 54 * 16], m1
11994
11995 ; mode 6 [row 8]
11996 pmaddubsw m1, m0, [r5 + 21 * 16]
11997 pmulhrsw m1, m3
11998 pmaddubsw m4, m2, [r5 + 21 * 16]
11999 pmulhrsw m4, m3
12000 packuswb m1, m4
12001 movu [r0 + 72 * 16], m1
12002
12003 ; mode 7 [row 12]
12004 movu [r0 + 92 * 16], m1
12005
12006 ; mode 7 [row 10]
12007 pmaddubsw m1, m0, [r5 + 3 * 16]
12008 pmulhrsw m1, m3
12009 pmaddubsw m4, m2, [r5 + 3 * 16]
12010 pmulhrsw m4, m3
12011 packuswb m1, m4
12012 movu [r0 + 90 * 16], m1
12013
12014 ; mode 7 [row 11]
12015 pmaddubsw m1, m0, [r5 + 12 * 16]
12016 pmulhrsw m1, m3
12017 pmaddubsw m4, m2, [r5 + 12 * 16]
12018 pmulhrsw m4, m3
12019 packuswb m1, m4
12020 movu [r0 + 91 * 16], m1
12021
12022 ; mode 3 [row 4]
12022 ; m0 = 16 bytes from r4 + 5; m1 = the same run advanced by one byte.
12022 ; The byte appended by movd/palignr is the one following m0's last byte, i.e. r4 + 5 + 16 = r4 + 21
12022 ; (matches the N / N + 16 pairing of the neighbouring row loads). Only the low 8 bytes of m1 are
12022 ; consumed by punpcklbw, so this byte never reaches the output.
12023 movu m0, [r4 + 5]
12024 movd m1, [r4 + 21]
12025 palignr m1, m0, 1
12026 punpcklbw m0, m1
12027
12028 ; mode 17 [row 3 - second half]
12029 pmaddubsw m1, m0, [r5 + 24 * 16]
12030 pmulhrsw m1, m3
12031 packuswb m1, m1
12032 movh [r0 + 243 * 16 + 8], m1
12033
12034 ; mode 17 [row 3 - second half] end
12035 pmaddubsw m1, m0, [r5 + 2 * 16]
12036 pmulhrsw m1, m3
12037 movu m2, [r4 + 13]
12038 movd m4, [r4 + 29]
12039 palignr m4, m2, 1
12040 punpcklbw m2, m4
12041 pmaddubsw m4, m2, [r5 + 2 * 16]
12042 pmulhrsw m4, m3
12043 packuswb m1, m4
12044 movu [r0 + 20 * 16], m1
12045
12046 ;mode 6 [row 9]
12047 movu [r0 + 73 * 16], m1
12048
12049 ; mode 4 row [row 6]
12050 movu m6, [r5 + 19 * 16]
12051 pmaddubsw m1, m0, m6
12052 pmulhrsw m1, m3
12053 pmaddubsw m4, m2, m6
12054 pmulhrsw m4, m3
12055 packuswb m1, m4
12056 movu [r0 + 38 * 16], m1
12057
12058 ; mode 3 [row 5]
12059 pmaddubsw m1, m0, [r5 + 28 * 16]
12060 pmulhrsw m1, m3
12061 pmaddubsw m4, m2, [r5 + 28 * 16]
12062 pmulhrsw m4, m3
12063 packuswb m1, m4
12064 movu [r0 + 21 * 16], m1
12065
12066 ;mode 6 [row 11]
12067 movu [r0 + 75 * 16], m1
12068
12069 ; mode 5 row [row 7]
12070 pmaddubsw m1, m0, [r5 + 8 * 16]
12071 pmulhrsw m1, m3
12072 pmaddubsw m4, m2, [r5 + 8 * 16]
12073 pmulhrsw m4, m3
12074 packuswb m1, m4
12075 movu [r0 + 55 * 16], m1
12076
12077 ; mode 5 row [row 8]
12078 pmaddubsw m1, m0, [r5 + 25 * 16]
12079 pmulhrsw m1, m3
12080 pmaddubsw m4, m2, [r5 + 25 * 16]
12081 pmulhrsw m4, m3
12082 packuswb m1, m4
12083 movu [r0 + 56 * 16], m1
12084
12085 ; mode 6 [row 10]
12086 pmaddubsw m1, m0, [r5 + 15 * 16]
12087 pmulhrsw m1, m3
12088 pmaddubsw m4, m2, [r5 + 15 * 16]
12089 pmulhrsw m4, m3
12090 packuswb m1, m4
12091 movu [r0 + 74 * 16], m1
12092
12093 ; mode 7 [row 14]
12094 pmaddubsw m1, m0, [r5 + 7 * 16]
12095 pmulhrsw m1, m3
12096 pmaddubsw m4, m2, [r5 + 7 * 16]
12097 pmulhrsw m4, m3
12098 packuswb m1, m4
12099 movu [r0 + 94 * 16], m1
12100
12101 ; mode 7 [row 15]
12102 pmaddubsw m1, m0, [r5 + 16 * 16]
12103 pmulhrsw m1, m3
12104 pmaddubsw m4, m2, [r5 + 16 * 16]
12105 pmulhrsw m4, m3
12106 packuswb m1, m4
12107 movu [r0 + 95 * 16], m1
12108
12109 ; mode 3 [row 6]
12110 movu m0, [r4 + 6]
12111 movd m1, [r4 + 22]
12112 palignr m1, m0, 1
12113 punpcklbw m0, m1
12114
12115 ; mode 17 [row 2 - second half]
12116 pmaddubsw m1, m0, [r5 + 18 * 16]
12117 pmulhrsw m1, m3
12118 packuswb m1, m1
12119 movh [r0 + 242 * 16 + 8], m1
12120 ; mode 17 [row 2 - second half] end
12121
12122 pmaddubsw m1, m0, [r5 + 22 * 16]
12123 pmulhrsw m1, m3
12124 movu m2, [r4 + 14]
12125 movd m4, [r4 + 30]
12126 palignr m4, m2, 1
12127 punpcklbw m2, m4
12128 pmaddubsw m4, m2, [r5 + 22 * 16]
12129 pmulhrsw m4, m3
12130 packuswb m1, m4
12131 movu [r0 + 22 * 16], m1
12132
12133 ; mode 6 [row 13]
12134 movu [r0 + 77 * 16], m1
12135
12136 ; mode 4 row [row 7]
12137 pmaddubsw m1, m0, [r5 + 8 * 16]
12138 pmulhrsw m1, m3
12139 pmaddubsw m4, m2, [r5 + 8 * 16]
12140 pmulhrsw m4, m3
12141 packuswb m1, m4
12142 movu [r0 + 39 * 16], m1
12143
12144 ; mode 4 row [row 8]
12145 pmaddubsw m1, m0, [r5 + 29 * 16]
12146 pmulhrsw m1, m3
12147 pmaddubsw m4, m2, [r5 + 29 * 16]
12148 pmulhrsw m4, m3
12149 packuswb m1, m4
12150 movu [r0 + 40 * 16], m1
12151
12152 ; mode 5 row [row 9]
12153 pmaddubsw m1, m0, [r5 + 10 * 16]
12154 pmulhrsw m1, m3
12155 pmaddubsw m4, m2, [r5 + 10 * 16]
12156 pmulhrsw m4, m3
12157 packuswb m1, m4
12158 movu [r0 + 57 * 16], m1
12159
12160 ; mode 5 row [row 10]
12161 pmaddubsw m1, m0, [r5 + 27 * 16]
12162 pmulhrsw m1, m3
12163 pmaddubsw m4, m2, [r5 + 27 * 16]
12164 pmulhrsw m4, m3
12165 packuswb m1, m4
12166 movu [r0 + 58 * 16], m1
12167
12168 ; mode 6 [row 12]
12169 pmaddubsw m1, m0, [r5 + 9 * 16]
12170 pmulhrsw m1, m3
12171 pmaddubsw m4, m2, [r5 + 9 * 16]
12172 pmulhrsw m4, m3
12173 packuswb m1, m4
12174 movu [r0 + 76 * 16], m1
12175
12176 ; mode 3 [row 7]
12176 ; Pairs for output bytes 0..7 come from r4 + 7 / r4 + 8; pairs for bytes 8..15 from r4 + 15 / r4 + 16.
12176 ; The movd offsets are base + 16 (23 and 31), consistent with the other row loads; the appended byte
12176 ; lands in byte 15 of the palignr result and is not consumed by punpcklbw.
12177 movu m0, [r4 + 7]
12178 movd m1, [r4 + 23]
12179 palignr m1, m0, 1
12180 punpcklbw m0, m1
12181
12182 ; mode 17 [row 1 - second half]
12183 pmaddubsw m1, m0, [r5 + 12 * 16]
12184 pmulhrsw m1, m3
12185 packuswb m1, m1
12186 movh [r0 + 241 * 16 + 8], m1 ; bytes 8..15 of row 241 only
12187 ; mode 17 [row 1 - second half] end
12188
12189 pmaddubsw m1, m0, [r5 + 16 * 16] ; mode 3 [row 7], first 8 output bytes
12190 pmulhrsw m1, m3
12191 movu m2, [r4 + 15]
12192 movd m4, [r4 + 31]
12193 palignr m4, m2, 1
12194 punpcklbw m2, m4
12195 pmaddubsw m4, m2, [r5 + 16 * 16] ; last 8 output bytes
12196 pmulhrsw m4, m3
12197 packuswb m1, m4
12198 movu [r0 + 23 * 16], m1
12199
12200 ; mode 6 [row 15]
12201 movu [r0 + 79 * 16], m1
12202
12203 ; mode 4 row [row 9]
12204 pmaddubsw m1, m0, [r5 + 18 * 16]
12205 pmulhrsw m1, m3
12206 pmaddubsw m4, m2, [r5 + 18 * 16]
12207 pmulhrsw m4, m3
12208 packuswb m1, m4
12209 movu [r0 + 41 * 16], m1
12210
12211 ; mode 5 row [row 11]
12212 pmaddubsw m1, m0, [r5 + 12 * 16]
12213 pmulhrsw m1, m3
12214 pmaddubsw m4, m2, [r5 + 12 * 16]
12215 pmulhrsw m4, m3
12216 packuswb m1, m4
12217 movu [r0 + 59 * 16], m1
12218
12219 ; mode 5 row [row 12]
12220 pmaddubsw m1, m0, [r5 + 29 * 16]
12221 pmulhrsw m1, m3
12222 pmaddubsw m4, m2, [r5 + 29 * 16]
12223 pmulhrsw m4, m3
12224 packuswb m1, m4
12225 movu [r0 + 60 * 16], m1
12226
12227 ; mode 6 [row 14]
12228 pmaddubsw m1, m0, [r5 + 3 * 16]
12229 pmulhrsw m1, m3
12230 pmaddubsw m4, m2, [r5 + 3 * 16]
12231 pmulhrsw m4, m3
12232 packuswb m1, m4
12233 movu [r0 + 78 * 16], m1
12234
12235 ; mode 3 [row 8]
12236 movu m0, [r4 + 8]
12237 movd m1, [r4 + 24]
12238 palignr m1, m0, 1 ; m1 = m0 advanced by one byte
12239 punpcklbw m0, m1 ; byte pairs for output bytes 0..7
12240 pmaddubsw m1, m0, [r5 + 10 * 16]
12241 pmulhrsw m1, m3
12242 movu m2, [r4 + 16]
12243 psrldq m4, m2, 1 ; here the one-byte advance is built with a shift ...
12244 pinsrb m4, [r4 + 32], 15 ; ... plus an explicit insert of the following byte, instead of movd + palignr
12245 punpcklbw m2, m4 ; byte pairs for output bytes 8..15
12246 pmaddubsw m4, m2, [r5 + 10 * 16]
12247 pmulhrsw m4, m3
12248 packuswb m1, m4
12249 movu [r0 + 24 * 16], m1
12250
12251 ; mode 4 row [row 10]
12252 pmaddubsw m1, m0, [r5 + 7 * 16]
12253 pmulhrsw m1, m3
12254 pmaddubsw m4, m2, [r5 + 7 * 16]
12255 pmulhrsw m4, m3
12256 packuswb m1, m4
12257 movu [r0 + 42 * 16], m1
12258
12259 ; mode 4 row [row 11]
12260 pmaddubsw m1, m0, [r5 + 28 * 16]
12261 pmulhrsw m1, m3
12262 pmaddubsw m4, m2, [r5 + 28 * 16]
12263 pmulhrsw m4, m3
12264 packuswb m1, m4
12265 movu [r0 + 43 * 16], m1
12266
12267 ; mode 5 row [row 13]
12268 pmaddubsw m1, m0, [r5 + 14 * 16]
12269 pmulhrsw m1, m3
12270 pmaddubsw m4, m2, [r5 + 14 * 16]
12271 pmulhrsw m4, m3
12272 packuswb m1, m4
12273 movu [r0 + 61 * 16], m1
12274
12275 ; mode 5 row [row 14]
12276 pmaddubsw m1, m0, [r5 + 31 * 16]
12277 pmulhrsw m1, m3
12278 pmaddubsw m4, m2, [r5 + 31 * 16]
12279 pmulhrsw m4, m3
12280 packuswb m1, m4
12281 movu [r0 + 62 * 16], m1
12282
12283 ; mode 3 [row 9]
12283 ; m0 pairs come from r4 + 9 / r4 + 10, m2 pairs from r4 + 17 / r4 + 18. The movd after each movu
12283 ; supplies the byte that follows the 16-byte run (base + 16: 25 and 33); it ends up in byte 15 of
12283 ; the palignr result, which punpcklbw does not consume.
12284 movu m0, [r4 + 9]
12285 movd m1, [r4 + 25]
12286 palignr m1, m0, 1
12287 punpcklbw m0, m1
12288 pmaddubsw m1, m0, [r5 + 4 * 16] ; first 8 output bytes
12289 pmulhrsw m1, m3
12290 movu m2, [r4 + 17]
12291 movd m4, [r4 + 33]
12292 palignr m4, m2, 1
12293 punpcklbw m2, m4
12294 pmaddubsw m4, m2, [r5 + 4 * 16] ; last 8 output bytes
12295 pmulhrsw m4, m3
12296 packuswb m1, m4
12297 movu [r0 + 25 * 16], m1
12298
12299 ; mode 4 row [row 12]
12300 pmaddubsw m1, m0, [r5 + 17 * 16]
12301 pmulhrsw m1, m3
12302 pmaddubsw m4, m2, [r5 + 17 * 16]
12303 pmulhrsw m4, m3
12304 packuswb m1, m4
12305 movu [r0 + 44 * 16], m1
12306
12307 ; mode 3 [row 10]
12308 pmaddubsw m1, m0, [r5 + 30 * 16]
12309 pmulhrsw m1, m3
12310 pmaddubsw m4, m2, [r5 + 30 * 16]
12311 pmulhrsw m4, m3
12312 packuswb m1, m4
12313 movu [r0 + 26 * 16], m1
12314
12315 ; mode 5 row [row 15]
12316 pmaddubsw m1, m0, [r5 + 16 * 16]
12317 pmulhrsw m1, m3
12318 pmaddubsw m4, m2, [r5 + 16 * 16]
12319 pmulhrsw m4, m3
12320 packuswb m1, m4
12321 movu [r0 + 63 * 16], m1
12322
12323 ; mode 3 [row 11]
12324 movu m0, [r4 + 10]
12325 movd m1, [r4 + 26]
12326 palignr m1, m0, 1
12327 punpcklbw m0, m1
12328 pmaddubsw m1, m0, [r5 + 24 * 16]
12329 pmulhrsw m1, m3
12330 movu m2, [r4 + 18]
12331 movd m4, [r4 + 34]
12332 palignr m4, m2, 1
12333 punpcklbw m2, m4
12334 pmaddubsw m4, m2, [r5 + 24 * 16]
12335 pmulhrsw m4, m3
12336 packuswb m1, m4
12337 movu [r0 + 27 * 16], m1
12338
12339 ; mode 4 row [row 13]
12340 pmaddubsw m1, m0, [r5 + 6 * 16]
12341 pmulhrsw m1, m3
12342 pmaddubsw m4, m2, [r5 + 6 * 16]
12343 pmulhrsw m4, m3
12344 packuswb m1, m4
12345 movu [r0 + 45 * 16], m1
12346
12347 ; mode 4 row [row 14]
12348 pmaddubsw m1, m0, [r5 + 27 * 16]
12349 pmulhrsw m1, m3
12350 pmaddubsw m4, m2, [r5 + 27 * 16]
12351 pmulhrsw m4, m3
12352 packuswb m1, m4
12353 movu [r0 + 46 * 16], m1
12354
12355 ; mode 3 [row 12]
12356 movu m0, [r4 + 11]
12357 movd m1, [r4 + 27]
12358 palignr m1, m0, 1
12359 punpcklbw m0, m1
12360 pmaddubsw m1, m0, [r5 + 18 * 16]
12361 pmulhrsw m1, m3
12362 movu m2, [r4 + 19]
12363 movd m4, [r4 + 35]
12364 palignr m4, m2, 1
12365 punpcklbw m2, m4
12366 pmaddubsw m4, m2, [r5 + 18 * 16]
12367 pmulhrsw m4, m3
12368 packuswb m1, m4
12369 movu [r0 + 28 * 16], m1
12370
12371 ; mode 4 row [row 15]
12372 pmaddubsw m1, m0, [r5 + 16 * 16]
12373 pmulhrsw m1, m3
12374 pmaddubsw m4, m2, [r5 + 16 * 16]
12375 pmulhrsw m4, m3
12376 packuswb m1, m4
12377 movu [r0 + 47 * 16], m1
12378
12379 ; mode 3 [row 13]
12380 movu m0, [r4 + 12]
12381 movd m1, [r4 + 28]
12382 palignr m1, m0, 1
12383 punpcklbw m0, m1
12384 pmaddubsw m1, m0, [r5 + 12 * 16]
12385 pmulhrsw m1, m3
12386 movu m2, [r4 + 20]
12387 movd m4, [r4 + 36]
12388 palignr m4, m2, 1
12389 punpcklbw m2, m4
12390 pmaddubsw m4, m2, [r5 + 12 * 16]
12391 pmulhrsw m4, m3
12392 packuswb m1, m4
12393 movu [r0 + 29 * 16], m1
12394
12395 ; mode 3 [row 14]
12396 movu m0, [r4 + 13]
12397 movd m1, [r4 + 29]
12398 palignr m1, m0, 1
12399 punpcklbw m0, m1
12400 pmaddubsw m1, m0, [r5 + 6 * 16]
12401 pmulhrsw m1, m3
12402 movu m2, [r4 + 21]
12403 movd m4, [r4 + 37]
12404 palignr m4, m2, 1
12405 punpcklbw m2, m4
12406 pmaddubsw m4, m2, [r5 + 6 * 16]
12407 pmulhrsw m4, m3
12408 packuswb m1, m4
12409 movu [r0 + 30 * 16], m1
12410
12411 ; mode 9
12411 ; Reads its source bytes through r2 (the preceding mode 3..8 sections read through r4).
12412 movu m0, [r2 + 1]
12413 movd m1, [r2 + 17]
12414 palignr m1, m0, 1 ; m1 = the 16 bytes starting at r2 + 2
12415
12416 ; mode 9 [row 15]: stored unweighted, all 16 bytes of m1 are used here
12417 movu [r0 + 127 * 16], m1
12418
12419 ; mode 9 [row 0]
12420 punpcklbw m0, m1 ; byte pairs for output bytes 0..7; m0 is reused by every mode 9 row below
12421 pmaddubsw m1, m0, [r5 + 2 * 16]
12422 pmulhrsw m1, m3
12423 movu m7, [r2 + 9]
12424 movd m2, [r2 + 25] ; byte following m7's 16-byte run, loaded into the register palignr actually reads
12425 palignr m2, m7, 1 ; m2 = m7 advanced by one byte
12426 punpcklbw m7, m2 ; byte pairs for output bytes 8..15; m7 is reused by every mode 9 row below
12427 pmaddubsw m2, m7, [r5 + 2 * 16]
12428 pmulhrsw m2, m3
12429 packuswb m1, m2
12430 movu [r0 + 112 * 16], m1
12431
12432 ; mode 9 [row 1]
12433 pmaddubsw m1, m0, [r5 + 4 * 16]
12434 pmulhrsw m1, m3
12435 pmaddubsw m2, m7, [r5 + 4 * 16]
12436 pmulhrsw m2, m3
12437 packuswb m1, m2
12438 movu [r0 + 113 * 16], m1
12439
12440 ; mode 9 [row 2]
12441 pmaddubsw m1, m0, [r5 + 6 * 16]
12442 pmulhrsw m1, m3
12443 pmaddubsw m2, m7, [r5 + 6 * 16]
12444 pmulhrsw m2, m3
12445 packuswb m1, m2
12446 movu [r0 + 114 * 16], m1
12447
12448 ; mode 9 [row 3]
12449 pmaddubsw m1, m0, [r5 + 8 * 16]
12450 pmulhrsw m1, m3
12451 pmaddubsw m2, m7, [r5 + 8 * 16]
12452 pmulhrsw m2, m3
12453 packuswb m1, m2
12454 movu [r0 + 115 * 16], m1
12455
12456 ; mode 9 [row 4]
12457 pmaddubsw m1, m0, [r5 + 10 * 16]
12458 pmulhrsw m1, m3
12459 pmaddubsw m2, m7, [r5 + 10 * 16]
12460 pmulhrsw m2, m3
12461 packuswb m1, m2
12462 movu [r0 + 116 * 16], m1
12463
12464 ; mode 9 [row 5]
12465 pmaddubsw m1, m0, [r5 + 12 * 16]
12466 pmulhrsw m1, m3
12467 pmaddubsw m2, m7, [r5 + 12 * 16]
12468 pmulhrsw m2, m3
12469 packuswb m1, m2
12470 movu [r0 + 117 * 16], m1
12471
12472 ; mode 9 [row 6]
12473 pmaddubsw m1, m0, [r5 + 14 * 16]
12474 pmulhrsw m1, m3
12475 pmaddubsw m2, m7, [r5 + 14 * 16]
12476 pmulhrsw m2, m3
12477 packuswb m1, m2
12478 movu [r0 + 118 * 16], m1
12479
12480 ; mode 9 [row 7]
12481 pmaddubsw m1, m0, [r5 + 16 * 16]
12482 pmulhrsw m1, m3
12483 pmaddubsw m2, m7, [r5 + 16 * 16]
12484 pmulhrsw m2, m3
12485 packuswb m1, m2
12486 movu [r0 + 119 * 16], m1
12487
12488 ; mode 9 [row 8]
12489 pmaddubsw m1, m0, [r5 + 18 * 16]
12490 pmulhrsw m1, m3
12491 pmaddubsw m2, m7, [r5 + 18 * 16]
12492 pmulhrsw m2, m3
12493 packuswb m1, m2
12494 movu [r0 + 120 * 16], m1
12495
12496 ; mode 9 [row 9]
12497 pmaddubsw m1, m0, [r5 + 20 * 16]
12498 pmulhrsw m1, m3
12499 pmaddubsw m2, m7, [r5 + 20 * 16]
12500 pmulhrsw m2, m3
12501 packuswb m1, m2
12502 movu [r0 + 121 * 16], m1
12503
12504 ; mode 9 [row 10]
12505 pmaddubsw m1, m0, [r5 + 22 * 16]
12506 pmulhrsw m1, m3
12507 pmaddubsw m2, m7, [r5 + 22 * 16]
12508 pmulhrsw m2, m3
12509 packuswb m1, m2
12510 movu [r0 + 122 * 16], m1
12511
12512 ; mode 9 [row 11]
12513 pmaddubsw m1, m0, [r5 + 24 * 16]
12514 pmulhrsw m1, m3
12515 pmaddubsw m2, m7, [r5 + 24 * 16]
12516 pmulhrsw m2, m3
12517 packuswb m1, m2
12518 movu [r0 + 123 * 16], m1
12519
12520 ; mode 9 [row 12]
12521 pmaddubsw m1, m0, [r5 + 26 * 16]
12522 pmulhrsw m1, m3
12523 pmaddubsw m2, m7, [r5 + 26 * 16]
12524 pmulhrsw m2, m3
12525 packuswb m1, m2
12526 movu [r0 + 124 * 16], m1
12527
12528 ; mode 9 [row 13]
12529 pmaddubsw m1, m0, [r5 + 28 * 16]
12530 pmulhrsw m1, m3
12531 pmaddubsw m2, m7, [r5 + 28 * 16]
12532 pmulhrsw m2, m3
12533 packuswb m1, m2
12534 movu [r0 + 125 * 16], m1
12535
12536 ; mode 9 [row 14]
12537 pmaddubsw m1, m0, [r5 + 30 * 16]
12538 pmulhrsw m1, m3
12539 pmaddubsw m2, m7, [r5 + 30 * 16]
12540 pmulhrsw m2, m3
12541 packuswb m1, m2
12542 movu [r0 + 126 * 16], m1
12543
12544 ; mode 10: the 16 bytes at r2 + 1 are replicated unchanged into rows 128..143,
12544 ; then the first byte of each of those rows is overwritten by the adjusted values computed below
12545 movu m1, [r2 + 1]
12546 movu [r0 + 128 * 16], m1
12547 movu [r0 + 129 * 16], m1
12548 movu [r0 + 130 * 16], m1
12549 movu [r0 + 131 * 16], m1
12550 movu [r0 + 132 * 16], m1
12551 movu [r0 + 133 * 16], m1
12552 movu [r0 + 134 * 16], m1
12553 movu [r0 + 135 * 16], m1
12554 movu [r0 + 136 * 16], m1
12555 movu [r0 + 137 * 16], m1
12556 movu [r0 + 138 * 16], m1
12557 movu [r0 + 139 * 16], m1
12558 movu [r0 + 140 * 16], m1
12559 movu [r0 + 141 * 16], m1
12560 movu [r0 + 142 * 16], m1
12561 movu [r0 + 143 * 16], m1
12562
12563 pxor m0, m0 ; m0 = 0: serves as the all-zero shuffle mask and as the zero half for byte->word widening
12564 pshufb m1, m1, m0 ; broadcast byte 0 of m1 (the byte at r2 + 1) to all lanes
12565 punpcklbw m1, m0 ; widen to eight words
12566 movu m2, [r1]
12567 pshufb m2, m2, m0 ; broadcast the byte at r1
12568 punpcklbw m2, m0 ; widen to eight words
12569 movu m4, [r1 + 1]
12570 punpcklbw m5, m4, m0 ; m5 = bytes r1 + 1 .. r1 + 8 as words
12571 punpckhbw m4, m0 ; m4 = bytes r1 + 9 .. r1 + 16 as words
12572 psubw m5, m2 ; difference against the broadcast byte from r1
12573 psubw m4, m2
12574 psraw m5, 1 ; arithmetic shift: halve the signed difference
12575 psraw m4, 1
12576 paddw m5, m1 ; add the broadcast byte from r2 + 1
12577 paddw m4, m1
12578 packuswb m5, m4 ; saturate to 0..255 and pack back to 16 bytes
12579
12580 pextrb [r0 + 128 * 16], m5, 0 ; byte i of m5 replaces the first byte of row 128 + i
12581 pextrb [r0 + 129 * 16], m5, 1
12582 pextrb [r0 + 130 * 16], m5, 2
12583 pextrb [r0 + 131 * 16], m5, 3
12584 pextrb [r0 + 132 * 16], m5, 4
12585 pextrb [r0 + 133 * 16], m5, 5
12586 pextrb [r0 + 134 * 16], m5, 6
12587 pextrb [r0 + 135 * 16], m5, 7
12588 pextrb [r0 + 136 * 16], m5, 8
12589 pextrb [r0 + 137 * 16], m5, 9
12590 pextrb [r0 + 138 * 16], m5, 10
12591 pextrb [r0 + 139 * 16], m5, 11
12592 pextrb [r0 + 140 * 16], m5, 12
12593 pextrb [r0 + 141 * 16], m5, 13
12594 pextrb [r0 + 142 * 16], m5, 14
12595 pextrb [r0 + 143 * 16], m5, 15
12596
12597 ; mode 11
12598 movu m0, [r2]
12599
12600 ; mode 11 [row 15]: the 16 bytes at r2 are stored unweighted
12601 movu [r0 + 159 * 16], m0
12602
12603 ; mode 11 [row 0]
12604 movu m1, [r2 + 1]
12605 punpcklbw m0, m1 ; pairs (r2[i], r2[i + 1]) for output bytes 0..7; m0 is reused by all mode 11 rows below
12606 pmaddubsw m1, m0, [r5 + 30 * 16]
12607 pmulhrsw m1, m3
12608 movu m7, [r2 + 8]
12609 movu m2, [r2 + 9] ; the one-byte-advanced run is loaded directly here (no movd + palignr)
12610 punpcklbw m7, m2 ; pairs for output bytes 8..15; m7 is reused by all mode 11 rows below
12611 pmaddubsw m2, m7, [r5 + 30 * 16]
12612 pmulhrsw m2, m3
12613 packuswb m1, m2
12614 movu [r0 + 144 * 16], m1
12615
12616 ; mode 11 [row 1]
12617 pmaddubsw m1, m0, [r5 + 28 * 16]
12618 pmulhrsw m1, m3
12619 pmaddubsw m2, m7, [r5 + 28 * 16]
12620 pmulhrsw m2, m3
12621 packuswb m1, m2
12622 movu [r0 + 145 * 16], m1
12623
12624 ; mode 11 [row 2]
12625 pmaddubsw m1, m0, [r5 + 26 * 16]
12626 pmulhrsw m1, m3
12627 pmaddubsw m2, m7, [r5 + 26 * 16]
12628 pmulhrsw m2, m3
12629 packuswb m1, m2
12630 movu [r0 + 146 * 16], m1
12631
12632 ; mode 11 [row 3]
12633 pmaddubsw m1, m0, [r5 + 24 * 16]
12634 pmulhrsw m1, m3
12635 pmaddubsw m2, m7, [r5 + 24 * 16]
12636 pmulhrsw m2, m3
12637 packuswb m1, m2
12638 movu [r0 + 147 * 16], m1
12639
12640 ; mode 11 [row 4]
12641 pmaddubsw m1, m0, [r5 + 22 * 16]
12642 pmulhrsw m1, m3
12643 pmaddubsw m2, m7, [r5 + 22 * 16]
12644 pmulhrsw m2, m3
12645 packuswb m1, m2
12646 movu [r0 + 148 * 16], m1
12647
12648 ; mode 11 [row 5]
12649 pmaddubsw m1, m0, [r5 + 20 * 16]
12650 pmulhrsw m1, m3
12651 pmaddubsw m2, m7, [r5 + 20 * 16]
12652 pmulhrsw m2, m3
12653 packuswb m1, m2
12654 movu [r0 + 149 * 16], m1
12655
12656 ; mode 11 [row 6]
12657 pmaddubsw m1, m0, [r5 + 18 * 16]
12658 pmulhrsw m1, m3
12659 pmaddubsw m2, m7, [r5 + 18 * 16]
12660 pmulhrsw m2, m3
12661 packuswb m1, m2
12662 movu [r0 + 150 * 16], m1
12663
12664 ; mode 11 [row 7]
12665 pmaddubsw m1, m0, [r5 + 16 * 16]
12666 pmulhrsw m1, m3
12667 pmaddubsw m2, m7, [r5 + 16 * 16]
12668 pmulhrsw m2, m3
12669 packuswb m1, m2
12670 movu [r0 + 151 * 16], m1
12671
12672 ; mode 11 [row 8]
12673 pmaddubsw m1, m0, [r5 + 14 * 16]
12674 pmulhrsw m1, m3
12675 pmaddubsw m2, m7, [r5 + 14 * 16]
12676 pmulhrsw m2, m3
12677 packuswb m1, m2
12678 movu [r0 + 152 * 16], m1
12679
12680 ; mode 11 [row 9]
12681 pmaddubsw m1, m0, [r5 + 12 * 16]
12682 pmulhrsw m1, m3
12683 pmaddubsw m2, m7, [r5 + 12 * 16]
12684 pmulhrsw m2, m3
12685 packuswb m1, m2
12686 movu [r0 + 153 * 16], m1
12687
12688 ; mode 11 [row 10]
12689 pmaddubsw m1, m0, [r5 + 10 * 16]
12690 pmulhrsw m1, m3
12691 pmaddubsw m2, m7, [r5 + 10 * 16]
12692 pmulhrsw m2, m3
12693 packuswb m1, m2
12694 movu [r0 + 154 * 16], m1
12695
12696 ; mode 11 [row 11]
12697 pmaddubsw m1, m0, [r5 + 8 * 16]
12698 pmulhrsw m1, m3
12699 pmaddubsw m2, m7, [r5 + 8 * 16]
12700 pmulhrsw m2, m3
12701 packuswb m1, m2
12702 movu [r0 + 155 * 16], m1
12703
12704 ; mode 11 [row 12]
12705 pmaddubsw m1, m0, [r5 + 6 * 16]
12706 pmulhrsw m1, m3
12707 pmaddubsw m2, m7, [r5 + 6 * 16]
12708 pmulhrsw m2, m3
12709 packuswb m1, m2
12710 movu [r0 + 156 * 16], m1
12711
12712 ; mode 11 [row 13]
12713 pmaddubsw m1, m0, [r5 + 4 * 16]
12714 pmulhrsw m1, m3
12715 pmaddubsw m2, m7, [r5 + 4 * 16]
12716 pmulhrsw m2, m3
12717 packuswb m1, m2
12718 movu [r0 + 157 * 16], m1
12719
12720 ; mode 11 [row 14]
12721 pmaddubsw m1, m0, [r5 + 2 * 16]
12722 pmulhrsw m1, m3
12723 pmaddubsw m2, m7, [r5 + 2 * 16]
12724 pmulhrsw m2, m3
12725 packuswb m1, m2
12726 movu [r0 + 158 * 16], m1
12727
12728 ; mode 12 [row 0]
12729 movu m0, [r4]
12730 movu m1, [r4 + 1]
12731 punpcklbw m0, m1
12732 pmaddubsw m1, m0, [r5 + 27 * 16]
12733 pmulhrsw m1, m3
12734 movu m7, [r4 + 8]
12735 movd m2, [r4 + 24]
12736 palignr m2, m7, 1
12737 punpcklbw m7, m2
12738 pmaddubsw m2, m7, [r5 + 27 * 16]
12739 pmulhrsw m2, m3
12740 packuswb m1, m2
12741 movu [r0 + 160 * 16], m1
12742
12743 ; mode 12 [row 1]
12744 pmaddubsw m1, m0, [r5 + 22 * 16]
12745 pmulhrsw m1, m3
12746 pmaddubsw m2, m7, [r5 + 22 * 16]
12747 pmulhrsw m2, m3
12748 packuswb m1, m2
12749 movu [r0 + 161 * 16], m1
12750
12751 ; mode 12 [row 2]
12752 pmaddubsw m1, m0, [r5 + 17 * 16]
12753 pmulhrsw m1, m3
12754 pmaddubsw m2, m7, [r5 + 17 * 16]
12755 pmulhrsw m2, m3
12756 packuswb m1, m2
12757 movu [r0 + 162 * 16], m1
12758
12759 ; mode 12 [row 3]
12760 pmaddubsw m1, m0, [r5 + 12 * 16]
12761 pmulhrsw m1, m3
12762 pmaddubsw m2, m7, [r5 + 12 * 16]
12763 pmulhrsw m2, m3
12764 packuswb m1, m2
12765 movu [r0 + 163 * 16], m1
12766
12767 ; mode 12 [row 4]
12768 pmaddubsw m1, m0, [r5 + 7 * 16]
12769 pmulhrsw m1, m3
12770 pmaddubsw m2, m7, [r5 + 7 * 16]
12771 pmulhrsw m2, m3
12772 packuswb m1, m2
12773 movu [r0 + 164 * 16], m1
12774
12775 ; mode 12 [row 5]
12776 pmaddubsw m1, m0, [r5 + 2 * 16]
12777 pmulhrsw m1, m3
12778 pmaddubsw m2, m7, [r5 + 2 * 16]
12779 pmulhrsw m2, m3
12780 packuswb m1, m2
12781 movu [r0 + 165 * 16], m1
12782
12783 ; mode 13 [row 0]
12784 pmaddubsw m1, m0, [r5 + 23 * 16]
12785 pmulhrsw m1, m3
12786 pmaddubsw m2, m7, [r5 + 23 * 16]
12787 pmulhrsw m2, m3
12788 packuswb m1, m2
12789 movu [r0 + 176 * 16], m1
12790
12791 ; mode 13 [row 1]
12792 pmaddubsw m1, m0, [r5 + 14 * 16]
12793 pmulhrsw m1, m3
12794 pmaddubsw m2, m7, [r5 + 14 * 16]
12795 pmulhrsw m2, m3
12796 packuswb m1, m2
12797 movu [r0 + 177 * 16], m1
12798
12799 ; mode 13 [row 2]
12800 pmaddubsw m1, m0, [r5 + 5 * 16]
12801 pmulhrsw m1, m3
12802 pmaddubsw m2, m7, [r5 + 5 * 16]
12803 pmulhrsw m2, m3
12804 packuswb m1, m2
12805 movu [r0 + 178 * 16], m1
12806
12807 ; mode 14 [row 0]
12808 pmaddubsw m1, m0, [r5 + 19 * 16]
12809 pmulhrsw m1, m3
12810 pmaddubsw m2, m7, [r5 + 19 * 16]
12811 pmulhrsw m2, m3
12812 packuswb m1, m2
12813 movu [r0 + 192 * 16], m1
12814
12815 ; mode 14 [row 1]
12816 pmaddubsw m1, m0, [r5 + 6 * 16]
12817 pmulhrsw m1, m3
12818 pmaddubsw m2, m7, [r5 + 6 * 16]
12819 pmulhrsw m2, m3
12820 packuswb m1, m2
12821 movu [r0 + 193 * 16], m1
12822
12823 ; mode 17 [row 0]
12824 movu [r0 + 240 * 16], m1
12825
12826 ; mode 15 [row 0]
12827 pmaddubsw m1, m0, [r5 + 15 * 16]
12828 pmulhrsw m1, m3
12829 pmaddubsw m2, m7, [r5 + 15 * 16]
12830 pmulhrsw m2, m3
12831 packuswb m1, m2
12832 movu [r0 + 208 * 16], m1
12833
12834 ; mode 15 [row 15 - second half]
12835 pmaddubsw m1, m0, [r5 + 16 * 16]
12836 pmulhrsw m1, m3
12837 packuswb m1, m1
12838 movh [r0 + 223 * 16 + 8], m1
12839 ; mode 15 [row 15 - second half] end
12840
12841 ; mode 16 [row 0]
12842 pmaddubsw m1, m0, [r5 + 11 * 16]
12843 pmulhrsw m1, m3
12844 pmaddubsw m2, m7, [r5 + 11 * 16]
12845 pmulhrsw m2, m3
12846 packuswb m1, m2
12847 movu [r0 + 224 * 16], m1
12848
12849 ; mode 17 [row 9 - second half]
12850 pmaddubsw m1, m0, [r5 + 28 * 16]
12851 pmulhrsw m1, m3
12852 packuswb m1, m1
12853 movh [r0 + 249 * 16 + 8], m1
12854 ; mode 17 [row 9 - second half] end
12855
12856 ; mode 17 [row 10 - second half]
12857 pmaddubsw m1, m0, [r5 + 2 * 16]
12858 pmulhrsw m1, m3
12859 packuswb m1, m1
12860 movh [r0 + 250 * 16 + 8], m1
12861 ; mode 17 [row 10 - second half] end
12862
12863 ; mode 17 [row 1 - first half]
12864 pslldq m6, m0, 2
12865 pinsrb m6, [r3 + 0], 1
12866 pinsrb m6, [r3 + 1], 0
12867 pmaddubsw m1, m6, [r5 + 12 * 16]
12868 pmulhrsw m1, m3
12869 packuswb m1, m1
12870 movh [r0 + 241 * 16], m1
12871
12872 ; mode 17 [row 11 - second half]
12873 pmaddubsw m1, m6, [r5 + 8 * 16]
12874 pmulhrsw m1, m3
12875 packuswb m1, m1
12876 movh [r0 + 251 * 16 + 8], m1
12877 ; mode 17 [row 11 - second half] end
12878
12879 ; mode 17 [row 2 - first half]
12880 pslldq m6, 2
12881 pinsrb m6, [r3 + 1], 1
12882 pinsrb m6, [r3 + 2], 0
12883 pmaddubsw m1, m6, [r5 + 18 * 16]
12884 pmulhrsw m1, m3
12885 packuswb m1, m1
12886 movh [r0 + 242 * 16], m1
12887
12888 ; mode 17 [row 12 - second half]
12889 pmaddubsw m1, m6, [r5 + 14 * 16]
12890 pmulhrsw m1, m3
12891 packuswb m1, m1
12892 movh [r0 + 252 * 16 + 8], m1
12893 ; mode 17 [row 12 - second half] end
12894
12895 ; mode 17 [row 3 - first half]
12896 pslldq m6, 2
12897 pinsrb m6, [r3 + 2], 1
12898 pinsrb m6, [r3 + 4], 0
12899 pmaddubsw m1, m6, [r5 + 24 * 16]
12900 pmulhrsw m1, m3
12901 packuswb m1, m1
12902 movh [r0 + 243 * 16], m1
12903
12904 ; mode 17 [row 13 - second half]
12905 pmaddubsw m1, m6, [r5 + 20 * 16]
12906 pmulhrsw m1, m3
12907 packuswb m1, m1
12908 movh [r0 + 253 * 16 + 8], m1
12909
12910 ; mode 17 [row 4 - first half]
12911 pslldq m6, 2
12912 pinsrb m6, [r3 + 4], 1
12913 pinsrb m6, [r3 + 5], 0
12914 pmaddubsw m1, m6, [r5 + 30 * 16]
12915 pmulhrsw m1, m3
12916 packuswb m1, m1
12917 movh [r0 + 244 * 16], m1
12918
12919 ; mode 17 [row 5 - first half]
12920 pmaddubsw m1, m6, [r5 + 4 * 16]
12921 pmulhrsw m1, m3
12922 packuswb m1, m1
12923 movh [r0 + 245 * 16], m1
12924
12925 ; mode 17 [row 14 - second half]
12926 pmaddubsw m1, m6, [r5 + 26 * 16]
12927 pmulhrsw m1, m3
12928 packuswb m1, m1
12929 movh [r0 + 254 * 16 + 8], m1
12930 ; mode 17 [row 14 - second half] end
12931
12932 ; mode 17 [row 6 - first half]
12933 pslldq m6, 2
12934 pinsrb m6, [r3 + 5], 1
12935 pinsrb m6, [r3 + 6], 0
12936 pmaddubsw m1, m6, [r5 + 10 * 16]
12937 pmulhrsw m1, m3
12938 packuswb m1, m1
12939 movh [r0 + 246 * 16], m1
12940
12941 ; mode 17 [row 7 - first half]
12942 pslldq m6, 2
12943 pinsrb m6, [r3 + 6], 1
12944 pinsrb m6, [r3 + 7], 0
12945 pmaddubsw m1, m6, [r5 + 16 * 16]
12946 pmulhrsw m1, m3
12947 packuswb m1, m1
12948 movh [r0 + 247 * 16], m1
12949
12950 ; mode 17 [row 8 - first half]
12951 pslldq m6, 2
12952 pinsrb m6, [r3 + 7], 1
12953 pinsrb m6, [r3 + 9], 0
12954 pmaddubsw m1, m6, [r5 + 22 * 16]
12955 pmulhrsw m1, m3
12956 packuswb m1, m1
12957 movh [r0 + 248 * 16], m1
12958
12959 ; mode 17 [row 9 - first half]
12960 pslldq m6, 2
12961 pinsrb m6, [r3 + 9], 1
12962 pinsrb m6, [r3 + 10], 0
12963 pmaddubsw m1, m6, [r5 + 28 * 16]
12964 pmulhrsw m1, m3
12965 packuswb m1, m1
12966 movh [r0 + 249 * 16], m1
12967
12968 ; mode 17 [row 10 - first half]
12969 pmaddubsw m1, m6, [r5 + 2 * 16]
12970 pmulhrsw m1, m3
12971 packuswb m1, m1
12972 movh [r0 + 250 * 16], m1
12973
12974 ; mode 17 [row 11 - first half]
12975 pslldq m6, 2
12976 pinsrb m6, [r3 + 10], 1
12977 pinsrb m6, [r3 + 11], 0
12978 pmaddubsw m1, m6, [r5 + 8 * 16]
12979 pmulhrsw m1, m3
12980 packuswb m1, m1
12981 movh [r0 + 251 * 16], m1
12982
12983 ; mode 17 [row 12 - first half]
12984 pslldq m6, 2
12985 pinsrb m6, [r3 + 11], 1
12986 pinsrb m6, [r3 + 12], 0
12987 pmaddubsw m1, m6, [r5 + 14 * 16]
12988 pmulhrsw m1, m3
12989 packuswb m1, m1
12990 movh [r0 + 252 * 16], m1
12991
12992 ; mode 17 [row 13 - first half]
12993 pslldq m6, 2
12994 pinsrb m6, [r3 + 12], 1
12995 pinsrb m6, [r3 + 14], 0
12996 pmaddubsw m1, m6, [r5 + 20 * 16]
12997 pmulhrsw m1, m3
12998 packuswb m1, m1
12999 movh [r0 + 253 * 16], m1
13000
13001 ; mode 17 [row 14 - first half]
13002 pslldq m6, 2
13003 pinsrb m6, [r3 + 14], 1
13004 pinsrb m6, [r3 + 15], 0
13005 pmaddubsw m1, m6, [r5 + 26 * 16]
13006 pmulhrsw m1, m3
13007 packuswb m1, m1
13008 movh [r0 + 254 * 16], m1
13009
13010 ; mode 16 [row 12 - second half]
13011 pmaddubsw m1, m0, [r5 + 15 * 16]
13012 pmulhrsw m1, m3
13013 packuswb m1, m1
13014 movh [r0 + 236 * 16 + 8], m1
13015 ; mode 16 [row 12 - second half] end
13016
13017 ; mode 12 [row 6]
13018 pslldq m2, m0, 2
13019 pinsrb m2, [r3 + 0], 1
13020 pinsrb m2, [r3 + 6], 0
13021 pmaddubsw m1, m2, [r5 + 29 * 16]
13022 pmulhrsw m1, m3
13023 movu m0, [r4 + 7]
13024 psrldq m4, m0, 1
13025 punpcklbw m0, m4
13026 pmaddubsw m4, m0, [r5 + 29 * 16]
13027 pmulhrsw m4, m3
13028 packuswb m1, m4
13029 movu [r0 + 166 * 16], m1
13030
13031 ; mode 12 [row 7]
13032 pmaddubsw m1, m2, [r5 + 24 * 16]
13033 pmulhrsw m1, m3
13034 pmaddubsw m4, m0, [r5 + 24 * 16]
13035 pmulhrsw m4, m3
13036 packuswb m1, m4
13037 movu [r0 + 167 * 16], m1
13038
13039 ; mode 12 [row 8]
13040 pmaddubsw m1, m2, [r5 + 19 * 16]
13041 pmulhrsw m1, m3
13042 pmaddubsw m4, m0, [r5 + 19 * 16]
13043 pmulhrsw m4, m3
13044 packuswb m1, m4
13045 movu [r0 + 168 * 16], m1
13046
13047 ; mode 12 [row 9]
13048 pmaddubsw m1, m2, [r5 + 14 * 16]
13049 pmulhrsw m1, m3
13050 pmaddubsw m4, m0, [r5 + 14 * 16]
13051 pmulhrsw m4, m3
13052 packuswb m1, m4
13053 movu [r0 + 169 * 16], m1
13054
13055 ; mode 12 [row 10]
13056 pmaddubsw m1, m2, [r5 + 9 * 16]
13057 pmulhrsw m1, m3
13058 pmaddubsw m4, m0, [r5 + 9 * 16]
13059 pmulhrsw m4, m3
13060 packuswb m1, m4
13061 movu [r0 + 170 * 16], m1
13062
13063 ; mode 12 [row 11]
13064 pmaddubsw m1, m2, [r5 + 4 * 16]
13065 pmulhrsw m1, m3
13066 pmaddubsw m4, m0, [r5 + 4 * 16]
13067 pmulhrsw m4, m3
13068 packuswb m1, m4
13069 movu [r0 + 171 * 16], m1
13070
13071 ; mode 13 [row 3]
13072 pinsrb m7, m2, [r3 + 4], 0
13073 pmaddubsw m1, m7, [r5 + 28 * 16]
13074 pmulhrsw m1, m3
13075 pmaddubsw m4, m0, [r5 + 28 * 16]
13076 pmulhrsw m4, m3
13077 packuswb m1, m4
13078 movu [r0 + 179 * 16], m1
13079
13080 ; mode 13 [row 4]
13081 pmaddubsw m1, m7, [r5 + 19 * 16]
13082 pmulhrsw m1, m3
13083 pmaddubsw m4, m0, [r5 + 19 * 16]
13084 pmulhrsw m4, m3
13085 packuswb m1, m4
13086 movu [r0 + 180 * 16], m1
13087
13088 ; mode 13 [row 5]
13089 pmaddubsw m1, m7, [r5 + 10 * 16]
13090 pmulhrsw m1, m3
13091 pmaddubsw m4, m0, [r5 + 10 * 16]
13092 pmulhrsw m4, m3
13093 packuswb m1, m4
13094 movu [r0 + 181 * 16], m1
13095
13096 ; mode 13 [row 6]
13097 pmaddubsw m1, m7, [r5 + 1 * 16]
13098 pmulhrsw m1, m3
13099 pmaddubsw m4, m0, [r5 + 1 * 16]
13100 pmulhrsw m4, m3
13101 packuswb m1, m4
13102 movu [r0 + 182 * 16], m1
13103
13104 ; mode 14 [row 2]
13105 pinsrb m5, m7, [r3 + 2], 0
13106 pmaddubsw m1, m5, [r5 + 25 * 16]
13107 pmulhrsw m1, m3
13108 pmaddubsw m4, m0, [r5 + 25 * 16]
13109 pmulhrsw m4, m3
13110 packuswb m1, m4
13111 movu [r0 + 194 * 16], m1
13112
13113 ; mode 14 [row 3]
13114 pmaddubsw m1, m5, [r5 + 12 * 16]
13115 pmulhrsw m1, m3
13116 pmaddubsw m4, m0, [r5 + 12 * 16]
13117 pmulhrsw m4, m3
13118 packuswb m1, m4
13119 movu [r0 + 195 * 16], m1
13120
13121 ; mode 15 [row 1]
13122 pmaddubsw m1, m5, [r5 + 30 * 16]
13123 pmulhrsw m1, m3
13124 pmaddubsw m4, m0, [r5 + 30 * 16]
13125 pmulhrsw m4, m3
13126 packuswb m1, m4
13127 movu [r0 + 209 * 16], m1
13128
13129 ; mode 15 [row 2]
13130 pmaddubsw m1, m5, [r5 + 13 * 16]
13131 pmulhrsw m1, m3
13132 pmaddubsw m4, m0, [r5 + 13 * 16]
13133 pmulhrsw m4, m3
13134 packuswb m1, m4
13135 movu [r0 + 210 * 16], m1
13136
13137 ; mode 16 [row 1]
13138 pmaddubsw m1, m5, [r5 + 22 * 16]
13139 pmulhrsw m1, m3
13140 pmaddubsw m4, m0, [r5 + 22 * 16]
13141 pmulhrsw m4, m3
13142 packuswb m1, m4
13143 movu [r0 + 225 * 16], m1
13144
13145 ; mode 16 [row 2]
13146 pmaddubsw m1, m5, [r5 + 1 * 16]
13147 pmulhrsw m1, m3
13148 pmaddubsw m4, m0, [r5 + 1 * 16]
13149 pmulhrsw m4, m3
13150 packuswb m1, m4
13151 movu [r0 + 226 * 16], m1
13152
13153 ; mode 16 [row 13 - second half]
13154 pmaddubsw m1, m5, [r5 + 26 * 16]
13155 pmulhrsw m1, m3
13156 packuswb m1, m1
13157 movh [r0 + 237 * 16 + 8], m1
13158 ; mode 16 [row 13 - second half] end
13159
13160 ; mode 16 [row 14 - second half]
13161 pmaddubsw m1, m5, [r5 + 5 * 16]
13162 pmulhrsw m1, m3
13163 packuswb m1, m1
13164 movh [r0 + 238 * 16 + 8], m1
13165 ; mode 16 [row 14 - second half] end
13166
13167 ; mode 16 [row 3]
13168 pslldq m6, m5, 2
13169 pinsrb m6, [r3 + 2], 1
13170 pinsrb m6, [r3 + 3], 0
13171 pmaddubsw m1, m6, [r5 + 12 * 16]
13172 pmulhrsw m1, m3
13173 packuswb m1, m1
13174 movh [r0 + 227 * 16], m1
13175
13176 ; mode 16 [row 15 - second half]
13177 pmaddubsw m1, m6, [r5 + 16 * 16]
13178 pmulhrsw m1, m3
13179 packuswb m1, m1
13180 movh [r0 + 239 * 16 + 8], m1
13181 ; mode 16 [row 15 - second half] end
13182
13183 ; mode 16 [row 4- first half]
13184 pslldq m6, 2
13185 pinsrb m6, [r3 + 3], 1
13186 pinsrb m6, [r3 + 5], 0
13187 pmaddubsw m1, m6, [r5 + 23 * 16]
13188 pmulhrsw m1, m3
13189 packuswb m1, m1
13190 movh [r0 + 228 * 16], m1
13191
13192 ; mode 16 [row 5- first half]
13193 pmaddubsw m1, m6, [r5 + 2 * 16]
13194 pmulhrsw m1, m3
13195 packuswb m1, m1
13196 movh [r0 + 229 * 16], m1
13197
13198 ; mode 16 [row 6- first half]
13199 pslldq m6, 2
13200 pinsrb m6, [r3 + 5], 1
13201 pinsrb m6, [r3 + 6], 0
13202 pmaddubsw m1, m6, [r5 + 13 * 16]
13203 pmulhrsw m1, m3
13204 packuswb m1, m1
13205 movh [r0 + 230 * 16], m1
13206
13207 ; mode 16 [row 7- first half]
13208 pslldq m6, 2
13209 pinsrb m6, [r3 + 6], 1
13210 pinsrb m6, [r3 + 8], 0
13211 pmaddubsw m1, m6, [r5 + 24 * 16]
13212 pmulhrsw m1, m3
13213 packuswb m1, m1
13214 movh [r0 + 231 * 16], m1
13215
13216 ; mode 16 [row 8- first half]
13217 pmaddubsw m1, m6, [r5 + 3 * 16]
13218 pmulhrsw m1, m3
13219 packuswb m1, m1
13220 movh [r0 + 232 * 16], m1
13221 ; mode 16 [row 8 - first half] end
13222
13223 ; mode 16 [row 9- first half]
13224 pslldq m6, 2
13225 pinsrb m6, [r3 + 8], 1
13226 pinsrb m6, [r3 + 9], 0
13227 pmaddubsw m1, m6, [r5 + 14 * 16]
13228 pmulhrsw m1, m3
13229 packuswb m1, m1
13230 movh [r0 + 233 * 16], m1
13231
13232 ; mode 16 [row 10 - first half]
13233 pslldq m6, 2
13234 pinsrb m6, [r3 + 9], 1
13235 pinsrb m6, [r3 + 11], 0
13236 pmaddubsw m1, m6, [r5 + 25 * 16]
13237 pmulhrsw m1, m3
13238 packuswb m1, m1
13239 movh [r0 + 234 * 16], m1
13240
13241 ; mode 16 [row 11 - first half]
13242 pmaddubsw m1, m6, [r5 + 4 * 16]
13243 pmulhrsw m1, m3
13244 packuswb m1, m1
13245 movh [r0 + 235 * 16], m1
13246
13247 ; mode 16 [row 12 - first half]
13248 pslldq m6, 2
13249 pinsrb m6, [r3 + 11], 1
13250 pinsrb m6, [r3 + 12], 0
13251 pmaddubsw m1, m6, [r5 + 15 * 16]
13252 pmulhrsw m1, m3
13253 packuswb m1, m1
13254 movh [r0 + 236 * 16], m1
13255
13256 ; mode 16 [row 13 - first half]
13257 pslldq m6, 2
13258 pinsrb m6, [r3 + 12], 1
13259 pinsrb m6, [r3 + 14], 0
13260 pmaddubsw m1, m6, [r5 + 26 * 16]
13261 pmulhrsw m1, m3
13262 packuswb m1, m1
13263 movh [r0 + 237 * 16], m1
13264
13265 ; mode 16 [row 14 - first half]
13266 pmaddubsw m1, m6, [r5 + 5 * 16]
13267 pmulhrsw m1, m3
13268 packuswb m1, m1
13269 movh [r0 + 238 * 16], m1
13270
13271 ; mode 16 [row 15 - first half]
13272 pslldq m6, 2
13273 pinsrb m6, [r3 + 14], 1
13274 pinsrb m6, [r3 + 15], 0
13275 pmaddubsw m1, m6, [r5 + 16 * 16]
13276 pmulhrsw m1, m3
13277 packuswb m1, m1
13278 movh [r0 + 239 * 16], m1
13279
13280 ; mode 14 [row 4]
13281 pslldq m5, 2
13282 pinsrb m5, [r3 + 2], 1
13283 pinsrb m5, [r3 + 5], 0
13284 movu m4, [r4 + 6]
13285 psrldq m0, m4, 1
13286 punpcklbw m4, m0
13287
13288 ; mode 16 [row 3 - second half]
13289 pmaddubsw m1, m4, [r5 + 12 * 16]
13290 pmulhrsw m1, m3
13291 packuswb m1, m1
13292 movh [r0 + 227 * 16 + 8], m1
13293
13294 ; mode 16 [row 3 - second half] end
13295 pmaddubsw m1, m5, [r5 + 31 * 16]
13296 pmulhrsw m1, m3
13297 pmaddubsw m0, m4, [r5 + 31 * 16]
13298 pmulhrsw m0, m3
13299 packuswb m1, m0
13300 movu [r0 + 196 * 16], m1
13301
13302 ; mode 14 [row 5]
13303 pmaddubsw m1, m5, [r5 + 18 * 16]
13304 pmulhrsw m1, m3
13305 pmaddubsw m0, m4, [r5 + 18 * 16]
13306 pmulhrsw m0, m3
13307 packuswb m1, m0
13308 movu [r0 + 197 * 16], m1
13309
13310 ; mode 14 [row 6]
13311 pmaddubsw m1, m5, [r5 + 5 * 16]
13312 pmulhrsw m1, m3
13313 pmaddubsw m0, m4, [r5 + 5 * 16]
13314 pmulhrsw m0, m3
13315 packuswb m1, m0
13316 movu [r0 + 198 * 16], m1
13317
13318 ; mode 15 [row 3]
13319 movu m6, m5
13320 pinsrb m6, [r3 + 4], 0
13321 pmaddubsw m1, m6, [r5 + 28 * 16]
13322 pmulhrsw m1, m3
13323 pmaddubsw m0, m4, [r5 + 28 * 16]
13324 pmulhrsw m0, m3
13325 packuswb m1, m0
13326 movu [r0 + 211 * 16], m1
13327
13328 ; mode 15 [row 4]
13329 pmaddubsw m1, m6, [r5 + 11 * 16]
13330 pmulhrsw m1, m3
13331 pmaddubsw m0, m4, [r5 + 11 * 16]
13332 pmulhrsw m0, m3
13333 packuswb m1, m0
13334 movu [r0 + 212 * 16], m1
13335
13336 ; mode 15 [row 5 - first half]
13337 pslldq m6, 2
13338 pinsrb m6, [r3 + 4], 1
13339 pinsrb m6, [r3 + 6], 0
13340 pmaddubsw m1, m6, [r5 + 26 * 16]
13341 pmulhrsw m1, m3
13342 packuswb m1, m1
13343 movh [r0 + 213 * 16], m1
13344
13345 ; mode 15 [row 6 - first half]
13346 pmaddubsw m1, m6, [r5 + 9 * 16]
13347 pmulhrsw m1, m3
13348 packuswb m1, m1
13349 movh [r0 + 214 * 16], m1
13350
13351 ; mode 15 [row 7 - first half]
13352 pslldq m6, 2
13353 pinsrb m6, [r3 + 6], 1
13354 pinsrb m6, [r3 + 8], 0
13355 pmaddubsw m1, m6, [r5 + 24 * 16]
13356 pmulhrsw m1, m3
13357 packuswb m1, m1
13358 movh [r0 + 215 * 16], m1
13359
13360 ; mode 15 [row 8 - first half]
13361 pmaddubsw m1, m6, [r5 + 7 * 16]
13362 pmulhrsw m1, m3
13363 packuswb m1, m1
13364 movh [r0 + 216 * 16], m1
13365
13366 ; mode 15 [row 9 - first half]
13367 pslldq m6, 2
13368 pinsrb m6, [r3 + 8], 1
13369 pinsrb m6, [r3 + 9], 0
13370 pmaddubsw m1, m6, [r5 + 22 * 16]
13371 pmulhrsw m1, m3
13372 packuswb m1, m1
13373 movh [r0 + 217 * 16], m1
13374
13375 ; mode 15 [row 10 - first half]
13376 pmaddubsw m1, m6, [r5 + 5 * 16]
13377 pmulhrsw m1, m3
13378 packuswb m1, m1
13379 movh [r0 + 218 * 16], m1
13380
13381 ; mode 15 [row 11 - first half]
13382 pslldq m6, 2
13383 pinsrb m6, [r3 + 9], 1
13384 pinsrb m6, [r3 + 11], 0
13385 pmaddubsw m1, m6, [r5 + 20 * 16]
13386 pmulhrsw m1, m3
13387 packuswb m1, m1
13388 movh [r0 + 219 * 16], m1
13389
13390 ; mode 15 [row 12 - first half]
13391 pmaddubsw m1, m6, [r5 + 3 * 16]
13392 pmulhrsw m1, m3
13393 packuswb m1, m1
13394 movh [r0 + 220 * 16], m1
13395
13396 ; mode 15 [row 13 - first half]
13397 pslldq m6, 2
13398 pinsrb m6, [r3 + 11], 1
13399 pinsrb m6, [r3 + 13], 0
13400 pmaddubsw m1, m6, [r5 + 18 * 16]
13401 pmulhrsw m1, m3
13402 packuswb m1, m1
13403 movh [r0 + 221 * 16], m1
13404
13405 ; mode 15 [row 14 - first half]
13406 pmaddubsw m1, m6, [r5 + 1 * 16]
13407 pmulhrsw m1, m3
13408 packuswb m1, m1
13409 movh [r0 + 222 * 16], m1
13410
13411 ; mode 15 [row 15 - first half]
13412 pslldq m6, 2
13413 pinsrb m6, [r3 + 13], 1
13414 pinsrb m6, [r3 + 15], 0
13415 pmaddubsw m1, m6, [r5 + 16 * 16]
13416 pmulhrsw m1, m3
13417 packuswb m1, m1
13418 movh [r0 + 223 * 16], m1
13419
13420 ; mode 14 [row 7]
13421 pslldq m5, 2
13422 pinsrb m5, [r3 + 5], 1
13423 pinsrb m5, [r3 + 7], 0
13424 movu m0, [r4 + 5]
13425 psrldq m6, m0, 1
13426 punpcklbw m0, m6
13427
13428 ; mode 15 [row 5 - second half]
13429 pmaddubsw m1, m0, [r5 + 26 * 16]
13430 pmulhrsw m1, m3
13431 packuswb m1, m1
13432 movh [r0 + 213 * 16 + 8], m1
13433 ; mode 15 [row 5 - second half] end
13434
13435 ; mode 15 [row 6 - second half]
13436 pmaddubsw m1, m0, [r5 + 9 * 16]
13437 pmulhrsw m1, m3
13438 packuswb m1, m1
13439 movh [r0 + 214 * 16 + 8], m1
13440 ; mode 15 [row 6 - second half] end
13441
13442 ; mode 16 [row 4 - second half]
13443 pmaddubsw m1, m0, [r5 + 23 * 16]
13444 pmulhrsw m1, m3
13445 packuswb m1, m1
13446 movh [r0 + 228 * 16 + 8], m1
13447 ; mode 16 [row 4 - second half] end
13448
13449 ; mode 16 [row 5 - second half]
13450 pmaddubsw m1, m0, [r5 + 2 * 16]
13451 pmulhrsw m1, m3
13452 packuswb m1, m1
13453 movh [r0 + 229 * 16 + 8], m1
13454
13455 ; mode 16 [row 5 - second half] end
13456 pmaddubsw m1, m5, [r5 + 24 * 16]
13457 pmulhrsw m1, m3
13458 pmaddubsw m6, m0, [r5 + 24 * 16]
13459 pmulhrsw m6, m3
13460 packuswb m1, m6
13461 movu [r0 + 199 * 16], m1
13462
13463 ; mode 14 [row 8]
13464 pmaddubsw m1, m5, [r5 + 11 * 16]
13465 pmulhrsw m1, m3
13466 pmaddubsw m6, m0, [r5 + 11 * 16]
13467 pmulhrsw m6, m3
13468 packuswb m1, m6
13469 movu [r0 + 200 * 16], m1
13470
13471 ; mode 14 [row 9]
13472 pslldq m5, 2
13473 pinsrb m5, [r3 + 7], 1
13474 pinsrb m5, [r3 + 10], 0
13475 movu m0, [r4 + 4]
13476 psrldq m6, m0, 1
13477 punpcklbw m0, m6
13478
13479 ; mode 15 [row 7 - second half]
13480 pmaddubsw m1, m0, [r5 + 24 * 16]
13481 pmulhrsw m1, m3
13482 packuswb m1, m1
13483 movh [r0 + 215 * 16 + 8], m1
13484 ; mode 15 [row 7 - second half] end
13485
13486 ; mode 15 [row 8 - second half]
13487 pmaddubsw m1, m0, [r5 + 7 * 16]
13488 pmulhrsw m1, m3
13489 packuswb m1, m1
13490 movh [r0 + 216 * 16 + 8], m1
13491 ; mode 15 [row 8 - second half] end
13492
13493 ; mode 16 [row 6 - second half]
13494 pmaddubsw m1, m0, [r5 + 13 * 16]
13495 pmulhrsw m1, m3
13496 packuswb m1, m1
13497 movh [r0 + 230 * 16 + 8], m1
13498 ; mode 16 [row 6 - second half] end
13499
13500 ; mode 14 [row 9] (continued)
13501 pmaddubsw m1, m5, [r5 + 30 * 16]
13502 pmulhrsw m1, m3
13503 pmaddubsw m6, m0, [r5 + 30 * 16]
13504 pmulhrsw m6, m3
13505 packuswb m1, m6
13506 movu [r0 + 201 * 16], m1
13507
13508 ; mode 14 [row 10]
13509 pmaddubsw m1, m5, [r5 + 17 * 16]
13510 pmulhrsw m1, m3
13511 pmaddubsw m6, m0, [r5 + 17 * 16]
13512 pmulhrsw m6, m3
13513 packuswb m1, m6
13514 movu [r0 + 202 * 16], m1
13515
13516 ; mode 14 [row 11]
13517 pmaddubsw m1, m5, [r5 + 4 * 16]
13518 pmulhrsw m1, m3
13519 pmaddubsw m6, m0, [r5 + 4 * 16]
13520 pmulhrsw m6, m3
13521 packuswb m1, m6
13522 movu [r0 + 203 * 16], m1
13523
13524 ; mode 14 [row 12]
13525 pslldq m5, 2
13526 pinsrb m5, [r3 + 10], 1
13527 pinsrb m5, [r3 + 12], 0
13528 movu m0, [r4 + 3]
13529 psrldq m6, m0, 1
13530 punpcklbw m0, m6
13531
13532 ; mode 15 [row 9 - second half]
13533 pmaddubsw m1, m0, [r5 + 22 * 16]
13534 pmulhrsw m1, m3
13535 packuswb m1, m1
13536 movh [r0 + 217 * 16 + 8], m1
13537 ; mode 15 [row 9 - second half] end
13538
13539 ; mode 15 [row 10 - second half]
13540 pmaddubsw m1, m0, [r5 + 5 * 16]
13541 pmulhrsw m1, m3
13542 packuswb m1, m1
13543 movh [r0 + 218 * 16 + 8], m1
13544 ; mode 15 [row 10 - second half] end
13545
13546 ; mode 16 [row 7 - second half]
13547 pmaddubsw m1, m0, [r5 + 24 * 16]
13548 pmulhrsw m1, m3
13549 packuswb m1, m1
13550 movh [r0 + 231 * 16 + 8], m1
13551 ; mode 16 [row 7 - second half] end
13552
13553 ; mode 16 [row 8 - second half]
13554 pmaddubsw m1, m0, [r5 + 3 * 16]
13555 pmulhrsw m1, m3
13556 packuswb m1, m1
13557 movh [r0 + 232 * 16 + 8], m1
13558 ; mode 16 [row 8 - second half] end
13559
13560 pmaddubsw m1, m5, [r5 + 23 * 16]
13561 pmulhrsw m1, m3
13562 pmaddubsw m6, m0, [r5 + 23 * 16]
13563 pmulhrsw m6, m3
13564 packuswb m1, m6
13565 movu [r0 + 204 * 16], m1
13566
13567 ; mode 14 [row 13]
13568 pmaddubsw m1, m5, [r5 + 10 * 16]
13569 pmulhrsw m1, m3
13570 pmaddubsw m6, m0, [r5 + 10 * 16]
13571 pmulhrsw m6, m3
13572 packuswb m1, m6
13573 movu [r0 + 205 * 16], m1
13574
13575 ; mode 14 [row 14]
13576 pslldq m5, 2
13577 pinsrb m5, [r3 + 12], 1
13578 pinsrb m5, [r3 + 15], 0
13579 movu m0, [r4 + 2]
13580 psrldq m6, m0, 1
13581 punpcklbw m0, m6
13582
13583 ; mode 15 [row 11 - second half]
13584 pmaddubsw m1, m0, [r5 + 20 * 16]
13585 pmulhrsw m1, m3
13586 packuswb m1, m1
13587 movh [r0 + 219 * 16 + 8], m1
13588 ; mode 15 [row 11 - second half] end
13589
13590 ; mode 15 [row 12 - second half]
13591 pmaddubsw m1, m0, [r5 + 3 * 16]
13592 pmulhrsw m1, m3
13593 packuswb m1, m1
13594 movh [r0 + 220 * 16 + 8], m1
13595 ; mode 15 [row 12 - second half] end
13596
13597 ; mode 16 [row 9 - second half]
13598 pmaddubsw m1, m0, [r5 + 14 * 16]
13599 pmulhrsw m1, m3
13600 packuswb m1, m1
13601 movh [r0 + 233 * 16 + 8], m1
13602
13603 ; mode 16 [row 9 - second half] end
13604 pmaddubsw m1, m5, [r5 + 29 * 16]
13605 pmulhrsw m1, m3
13606 pmaddubsw m6, m0, [r5 + 29 * 16]
13607 pmulhrsw m6, m3
13608 packuswb m1, m6
13609 movu [r0 + 206 * 16], m1
13610
13611 ; mode 14 [row 15]
13612 pmaddubsw m1, m5, [r5 + 16 * 16]
13613 pmulhrsw m1, m3
13614 pmaddubsw m6, m0, [r5 + 16 * 16]
13615 pmulhrsw m6, m3
13616 packuswb m1, m6
13617 movu [r0 + 207 * 16], m1
13618
13619 ; mode 12 [row 12]
13620 pslldq m0, m2, 2
13621 pinsrb m0, [r3 + 6], 1
13622 pinsrb m0, [r3 + 13], 0
13623 pmaddubsw m1, m0, [r5 + 31 * 16]
13624 pmulhrsw m1, m3
13625 pmaddubsw m5, m4, [r5 + 31 * 16]
13626 pmulhrsw m5, m3
13627 packuswb m1, m5
13628 movu [r0 + 172 * 16], m1
13629
13630 ; mode 12 [row 13]
13631 pmaddubsw m1, m0, [r5 + 26 * 16]
13632 pmulhrsw m1, m3
13633 pmaddubsw m5, m4, [r5 + 26 * 16]
13634 pmulhrsw m5, m3
13635 packuswb m1, m5
13636 movu [r0 + 173 * 16], m1
13637
13638 ; mode 12 [row 14]
13639 pmaddubsw m1, m0, [r5 + 21 * 16]
13640 pmulhrsw m1, m3
13641 pmaddubsw m5, m4, [r5 + 21 * 16]
13642 pmulhrsw m5, m3
13643 packuswb m1, m5
13644 movu [r0 + 174 * 16], m1
13645
13646 ; mode 12 [row 15]
13647 pmaddubsw m1, m0, [r5 + 16 * 16]
13648 pmulhrsw m1, m3
13649 pmaddubsw m5, m4, [r5 + 16 * 16]
13650 pmulhrsw m5, m3
13651 packuswb m1, m5
13652 movu [r0 + 175 * 16], m1
13653
13654 ; mode 13 [row 7]
13655 pslldq m7, 2
13656 pinsrb m7, [r3 + 4], 1
13657 pinsrb m7, [r3 + 7], 0
13658 pmaddubsw m1, m7, [r5 + 24 * 16]
13659 pmulhrsw m1, m3
13660 pmaddubsw m5, m4, [r5 + 24 * 16]
13661 pmulhrsw m5, m3
13662 packuswb m1, m5
13663 movu [r0 + 183 * 16], m1
13664
13665 ; mode 13 [row 8]
13666 pmaddubsw m1, m7, [r5 + 15 * 16]
13667 pmulhrsw m1, m3
13668 pmaddubsw m5, m4, [r5 + 15 * 16]
13669 pmulhrsw m5, m3
13670 packuswb m1, m5
13671 movu [r0 + 184 * 16], m1
13672
13673 ; mode 13 [row 9]
13674 pmaddubsw m1, m7, [r5 + 6 * 16]
13675 pmulhrsw m1, m3
13676 pmaddubsw m5, m4, [r5 + 6 * 16]
13677 pmulhrsw m5, m3
13678 packuswb m1, m5
13679 movu [r0 + 185 * 16], m1
13680
13681 ; mode 13 [row 10]
13682 pslldq m7, 2
13683 pinsrb m7, [r3 + 7], 1
13684 pinsrb m7, [r3 + 11], 0
13685 pmaddubsw m1, m7, [r5 + 29 * 16]
13686 pmulhrsw m1, m3
13687 movu m4, [r4 + 5]
13688 psrldq m5, m4, 1
13689 punpcklbw m4, m5
13690 pmaddubsw m5, m4, [r5 + 29 * 16]
13691 pmulhrsw m5, m3
13692 packuswb m1, m5
13693 movu [r0 + 186 * 16], m1
13694
13695 ; mode 13 [row 11]
13696 pmaddubsw m1, m7, [r5 + 20 * 16]
13697 pmulhrsw m1, m3
13698 pmaddubsw m5, m4, [r5 + 20 * 16]
13699 pmulhrsw m5, m3
13700 packuswb m1, m5
13701 movu [r0 + 187 * 16], m1
13702
13703 ; mode 13 [row 12]
13704 pmaddubsw m1, m7, [r5 + 11 * 16]
13705 pmulhrsw m1, m3
13706 pmaddubsw m5, m4, [r5 + 11 * 16]
13707 pmulhrsw m5, m3
13708 packuswb m1, m5
13709 movu [r0 + 188 * 16], m1
13710
13711 ; mode 13 [row 13]
13712 pmaddubsw m1, m7, [r5 + 2 * 16]
13713 pmulhrsw m1, m3
13714 pmaddubsw m5, m4, [r5 + 2 * 16]
13715 pmulhrsw m5, m3
13716 packuswb m1, m5
13717 movu [r0 + 189 * 16], m1
13718
13719 ; mode 13 [row 14]
13720 pslldq m7, 2
13721 pinsrb m7, [r3 + 11], 1
13722 pinsrb m7, [r3 + 14], 0
13723 pmaddubsw m1, m7, [r5 + 25 * 16]
13724 pmulhrsw m1, m3
13725 movu m4, [r4 + 4]
13726 psrldq m5, m4, 1
13727 punpcklbw m4, m5
13728 pmaddubsw m5, m4, [r5 + 25 * 16]
13729 pmulhrsw m5, m3
13730 packuswb m1, m5
13731 movu [r0 + 190 * 16], m1
13732
13733 ; mode 13 [row 15]
13734 pmaddubsw m1, m7, [r5 + 16 * 16]
13735 pmulhrsw m1, m3
13736 pmaddubsw m5, m4, [r5 + 16 * 16]
13737 pmulhrsw m5, m3
13738 packuswb m1, m5
13739 movu [r0 + 191 * 16], m1
13740
13741 ; mode 17 [row 15]
13742 movu m0, [r3]
13743 pshufb m1, m0, [tab_S1]
13744 movu [r0 + 255 * 16], m1
13745 movu m2, [r4]
13746 movd [r0 + 255 * 16 + 12], m2
13747
13748 ; mode 18 [row 0]
13749 movu [r0 + 256 * 16], m0
13750
13751 ; mode 18 [row 1 - 15]
13752 pslldq m4, m0, 1
13753 pinsrb m4, [r4 + 1], 0
13754 movu [r0 + 257 * 16], m4
13755 pslldq m4, 1
13756 pinsrb m4, [r4 + 2], 0
13757 movu [r0 + 258 * 16], m4
13758 pslldq m4, 1
13759 pinsrb m4, [r4 + 3], 0
13760 movu [r0 + 259 * 16], m4
13761 pslldq m4, 1
13762 pinsrb m4, [r4 + 4], 0
13763 movu [r0 + 260 * 16], m4
13764 pslldq m4, 1
13765 pinsrb m4, [r4 + 5], 0
13766 movu [r0 + 261 * 16], m4
13767 pslldq m4, 1
13768 pinsrb m4, [r4 + 6], 0
13769 movu [r0 + 262 * 16], m4
13770 pslldq m4, 1
13771 pinsrb m4, [r4 + 7], 0
13772 movu [r0 + 263 * 16], m4
13773 pslldq m4, 1
13774 pinsrb m4, [r4 + 8], 0
13775 movu [r0 + 264 * 16], m4
13776 pslldq m4, 1
13777 pinsrb m4, [r4 + 9], 0
13778 movu [r0 + 265 * 16], m4
13779 pslldq m4, 1
13780 pinsrb m4, [r4 + 10], 0
13781 movu [r0 + 266 * 16], m4
13782 pslldq m4, 1
13783 pinsrb m4, [r4 + 11], 0
13784 movu [r0 + 267 * 16], m4
13785 pslldq m4, 1
13786 pinsrb m4, [r4 + 12], 0
13787 movu [r0 + 268 * 16], m4
13788 pslldq m4, 1
13789 pinsrb m4, [r4 + 13], 0
13790 movu [r0 + 269 * 16], m4
13791 pslldq m4, 1
13792 pinsrb m4, [r4 + 14], 0
13793 movu [r0 + 270 * 16], m4
13794 pslldq m4, 1
13795 pinsrb m4, [r4 + 15], 0
13796 movu [r0 + 271 * 16], m4
13797
13798 ; mode 19 [row 0]
13799 psrldq m2, m0, 1
13800 punpcklbw m0, m2
13801 movu m5, [r3 + 8]
13802 psrldq m6, m5, 1
13803 punpcklbw m5, m6
13804 pmaddubsw m4, m0, [r5 + 6 * 16]
13805 pmulhrsw m4, m3
13806 pmaddubsw m6, m5, [r5 + 6 * 16]
13807 pmulhrsw m6, m3
13808 packuswb m4, m6
13809 movu [r0 + 272 * 16], m4
13810
13811 ; mode 20 [row 0]
13812 pmaddubsw m4, m0, [r5 + 11 * 16]
13813 pmulhrsw m4, m3
13814 pmaddubsw m6, m5, [r5 + 11 * 16]
13815 pmulhrsw m6, m3
13816 packuswb m4, m6
13817 movu [r0 + 288 * 16], m4
13818
13819 ; mode 21 [row 0]
13820 pmaddubsw m4, m0, [r5 + 15 * 16]
13821 pmulhrsw m4, m3
13822 pmaddubsw m6, m5, [r5 + 15 * 16]
13823 pmulhrsw m6, m3
13824 packuswb m4, m6
13825 movu [r0 + 304 * 16], m4
13826
13827 ; mode 22 [row 0]
13828 pmaddubsw m4, m0, [r5 + 19 * 16]
13829 pmulhrsw m4, m3
13830 pmaddubsw m6, m5, [r5 + 19 * 16]
13831 pmulhrsw m6, m3
13832 packuswb m4, m6
13833 movu [r0 + 320 * 16], m4
13834
13835 ; mode 22 [row 1]
13836 pmaddubsw m4, m0, [r5 + 6 * 16]
13837 pmulhrsw m4, m3
13838 pmaddubsw m6, m5, [r5 + 6 * 16]
13839 pmulhrsw m6, m3
13840 packuswb m4, m6
13841 movu [r0 + 321 * 16], m4
13842
13843 ; mode 23 [row 0]
13844 pmaddubsw m4, m0, [r5 + 23 * 16]
13845 pmulhrsw m4, m3
13846 pmaddubsw m6, m5, [r5 + 23 * 16]
13847 pmulhrsw m6, m3
13848 packuswb m4, m6
13849 movu [r0 + 336 * 16], m4
13850
13851 ; mode 23 [row 1]
13852 pmaddubsw m4, m0, [r5 + 14 * 16]
13853 pmulhrsw m4, m3
13854 pmaddubsw m6, m5, [r5 + 14 * 16]
13855 pmulhrsw m6, m3
13856 packuswb m4, m6
13857 movu [r0 + 337 * 16], m4
13858
13859 ; mode 23 [row 2]
13860 pmaddubsw m4, m0, [r5 + 5 * 16]
13861 pmulhrsw m4, m3
13862 pmaddubsw m6, m5, [r5 + 5 * 16]
13863 pmulhrsw m6, m3
13864 packuswb m4, m6
13865 movu [r0 + 338 * 16], m4
13866
13867 ; mode 24 [row 0]
13868 pmaddubsw m4, m0, [r5 + 27 * 16]
13869 pmulhrsw m4, m3
13870 pmaddubsw m6, m5, [r5 + 27 * 16]
13871 pmulhrsw m6, m3
13872 packuswb m4, m6
13873 movu [r0 + 352 * 16], m4
13874
13875 ; mode 24 [row 1]
13876 pmaddubsw m4, m0, [r5 + 22 * 16]
13877 pmulhrsw m4, m3
13878 pmaddubsw m6, m5, [r5 + 22 * 16]
13879 pmulhrsw m6, m3
13880 packuswb m4, m6
13881 movu [r0 + 353 * 16], m4
13882
13883 ; mode 24 [row 2]
13884 pmaddubsw m4, m0, [r5 + 17 * 16]
13885 pmulhrsw m4, m3
13886 pmaddubsw m6, m5, [r5 + 17 * 16]
13887 pmulhrsw m6, m3
13888 packuswb m4, m6
13889 movu [r0 + 354 * 16], m4
13890
13891 ; mode 24 [row 3]
13892 pmaddubsw m4, m0, [r5 + 12 * 16]
13893 pmulhrsw m4, m3
13894 pmaddubsw m6, m5, [r5 + 12 * 16]
13895 pmulhrsw m6, m3
13896 packuswb m4, m6
13897 movu [r0 + 355 * 16], m4
13898
13899 ; mode 24 [row 4]
13900 pmaddubsw m4, m0, [r5 + 7 * 16]
13901 pmulhrsw m4, m3
13902 pmaddubsw m6, m5, [r5 + 7 * 16]
13903 pmulhrsw m6, m3
13904 packuswb m4, m6
13905 movu [r0 + 356 * 16], m4
13906
13907 ; mode 24 [row 5]
13908 pmaddubsw m4, m0, [r5 + 2 * 16]
13909 pmulhrsw m4, m3
13910 pmaddubsw m6, m5, [r5 + 2 * 16]
13911 pmulhrsw m6, m3
13912 packuswb m4, m6
13913 movu [r0 + 357 * 16], m4
13914
13915 ; mode 24 [row 6 - first half]
13916 pslldq m7, m0, 2
13917 pinsrb m7, [r4 + 0], 1
13918 pinsrb m7, [r4 + 6], 0
13919 pmaddubsw m4, m7, [r5 + 29 * 16]
13920 pmulhrsw m4, m3
13921 packuswb m4, m4
13922 movh [r0 + 358 * 16], m4
13923
13924 ; mode 24 [row 7 - first half]
13925 pmaddubsw m4, m7, [r5 + 24 * 16]
13926 pmulhrsw m4, m3
13927 packuswb m4, m4
13928 movh [r0 + 359 * 16], m4
13929
13930 ; mode 24 [row 8 - first half]
13931 pmaddubsw m4, m7, [r5 + 19 * 16]
13932 pmulhrsw m4, m3
13933 packuswb m4, m4
13934 movh [r0 + 360 * 16], m4
13935
13936 ; mode 24 [row 9 - first half]
13937 pmaddubsw m4, m7, [r5 + 14 * 16]
13938 pmulhrsw m4, m3
13939 packuswb m4, m4
13940 movh [r0 + 361 * 16], m4
13941
13942 ; mode 24 [row 10 - first half]
13943 pmaddubsw m4, m7, [r5 + 9 * 16]
13944 pmulhrsw m4, m3
13945 packuswb m4, m4
13946 movh [r0 + 362 * 16], m4
13947
13948 ; mode 24 [row 11 - first half]
13949 pmaddubsw m4, m7, [r5 + 4 * 16]
13950 pmulhrsw m4, m3
13951 packuswb m4, m4
13952 movh [r0 + 363 * 16], m4
13953
13954 ; mode 24 [row 12 - first half]
13955 pslldq m7, 2
13956 pinsrb m7, [r4 + 6], 1
13957 pinsrb m7, [r4 + 13], 0
13958 pmaddubsw m4, m7, [r5 + 31 * 16]
13959 pmulhrsw m4, m3
13960 packuswb m4, m4
13961 movh [r0 + 364 * 16], m4
13962
13963 ; mode 24 [row 13 - first half]
13964 pmaddubsw m4, m7, [r5 + 26 * 16]
13965 pmulhrsw m4, m3
13966 packuswb m4, m4
13967 movh [r0 + 365 * 16], m4
13968
13969 ; mode 24 [row 14 - first half]
13970 pmaddubsw m4, m7, [r5 + 21 * 16]
13971 pmulhrsw m4, m3
13972 packuswb m4, m4
13973 movh [r0 + 366 * 16], m4
13974
13975 ; mode 24 [row 15 - first half]
13976 pmaddubsw m4, m7, [r5 + 16 * 16]
13977 pmulhrsw m4, m3
13978 packuswb m4, m4
13979 movh [r0 + 367 * 16], m4
13980
13981 ; mode 23 [row 3 - first half]
13982 pslldq m7, m0, 2
13983 pinsrb m7, [r4 + 0], 1
13984 pinsrb m7, [r4 + 4], 0
13985 pmaddubsw m4, m7, [r5 + 28 * 16]
13986 pmulhrsw m4, m3
13987 packuswb m4, m4
13988 movh [r0 + 339 * 16], m4
13989
13990 ; mode 23 [row 4 - first half]
13991 pmaddubsw m4, m7, [r5 + 19 * 16]
13992 pmulhrsw m4, m3
13993 packuswb m4, m4
13994 movh [r0 + 340 * 16], m4
13995
13996 ; mode 23 [row 5 - first half]
13997 pmaddubsw m4, m7, [r5 + 10 * 16]
13998 pmulhrsw m4, m3
13999 packuswb m4, m4
14000 movh [r0 + 341 * 16], m4
14001
14002 ; mode 23 [row 6 - first half]
14003 pmaddubsw m4, m7, [r5 + 1 * 16]
14004 pmulhrsw m4, m3
14005 packuswb m4, m4
14006 movh [r0 + 342 * 16], m4
14007
14008 ; mode 23 [row 7 - first half]
14009 pslldq m7, 2
14010 pinsrb m7, [r4 + 4], 1
14011 pinsrb m7, [r4 + 7], 0
14012 pmaddubsw m4, m7, [r5 + 24 * 16]
14013 pmulhrsw m4, m3
14014 packuswb m4, m4
14015 movh [r0 + 343 * 16], m4
14016
14017 ; mode 23 [row 8 - first half]
14018 pmaddubsw m4, m7, [r5 + 15 * 16]
14019 pmulhrsw m4, m3
14020 packuswb m4, m4
14021 movh [r0 + 344 * 16], m4
14022
14023 ; mode 23 [row 9 - first half]
14024 pmaddubsw m4, m7, [r5 + 6 * 16]
14025 pmulhrsw m4, m3
14026 packuswb m4, m4
14027 movh [r0 + 345 * 16], m4
14028
14029 ; mode 23 [row 10 - first half]
14030 pslldq m7, 2
14031 pinsrb m7, [r4 + 7], 1
14032 pinsrb m7, [r4 + 11], 0
14033 pmaddubsw m4, m7, [r5 + 29 * 16]
14034 pmulhrsw m4, m3
14035 packuswb m4, m4
14036 movh [r0 + 346 * 16], m4
14037
14038 ; mode 23 [row 11 - first half]
14039 pmaddubsw m4, m7, [r5 + 20 * 16]
14040 pmulhrsw m4, m3
14041 packuswb m4, m4
14042 movh [r0 + 347 * 16], m4
14043
14044 ; mode 23 [row 12 - first half]
14045 pmaddubsw m4, m7, [r5 + 11 * 16]
14046 pmulhrsw m4, m3
14047 packuswb m4, m4
14048 movh [r0 + 348 * 16], m4
14049
14050 ; mode 23 [row 13 - first half]
14051 pmaddubsw m4, m7, [r5 + 2 * 16]
14052 pmulhrsw m4, m3
14053 packuswb m4, m4
14054 movh [r0 + 349 * 16], m4
14055
14056 ; mode 23 [row 14 - first half]
14057 pslldq m7, 2
14058 pinsrb m7, [r4 + 11], 1
14059 pinsrb m7, [r4 + 14], 0
14060 pmaddubsw m4, m7, [r5 + 25 * 16]
14061 pmulhrsw m4, m3
14062 packuswb m4, m4
14063 movh [r0 + 350 * 16], m4
14064
14065 ; mode 23 [row 15 - first half]
14066 pmaddubsw m4, m7, [r5 + 16 * 16]
14067 pmulhrsw m4, m3
14068 packuswb m4, m4
14069 movh [r0 + 351 * 16], m4
14070
14071 ; mode 21 [row 15 - second half]
14072 pmaddubsw m4, m0, [r5 + 16 * 16]
14073 pmulhrsw m4, m3
14074 packuswb m4, m4
14075 movh [r0 + 319 * 16 + 8], m4
14076 ; mode 21 [row 15 - second half] end
14077
14078 ; mode 20 [row 1 - first half]
14079 pslldq m7, m0, 2
14080 pinsrb m7, [r4 + 0], 1
14081 pinsrb m7, [r4 + 2], 0
14082 pmaddubsw m4, m7, [r5 + 22 * 16]
14083 pmulhrsw m4, m3
14084 packuswb m4, m4
14085 movh [r0 + 289 * 16], m4
14086
14087 ; mode 20 [row 2 - first half]
14088 pmaddubsw m4, m7, [r5 + 1 * 16]
14089 pmulhrsw m4, m3
14090 packuswb m4, m4
14091 movh [r0 + 290 * 16], m4
14092
14093 ; mode 21 [row 1 - first half]
14094 pmaddubsw m4, m7, [r5 + 30 * 16]
14095 pmulhrsw m4, m3
14096 packuswb m4, m4
14097 movh [r0 + 305 * 16], m4
14098
14099 ; mode 21 [row 2 - first half]
14100 pmaddubsw m4, m7, [r5 + 13 * 16]
14101 pmulhrsw m4, m3
14102 packuswb m4, m4
14103 movh [r0 + 306 * 16], m4
14104
14105 ; mode 22 [row 2 - first half]
14106 pmaddubsw m4, m7, [r5 + 25 * 16]
14107 pmulhrsw m4, m3
14108 packuswb m4, m4
14109 movh [r0 + 322 * 16], m4
14110
14111 ; mode 22 [row 3 - first half]
14112 pmaddubsw m4, m7, [r5 + 12 * 16]
14113 pmulhrsw m4, m3
14114 packuswb m4, m4
14115 movh [r0 + 323 * 16], m4
14116
14117 ; mode 22 [row 4 - first half]
14118 pslldq m1, m7, 2
14119 pinsrb m1, [r4 + 2], 1
14120 pinsrb m1, [r4 + 5], 0
14121 pmaddubsw m4, m1, [r5 + 31 * 16]
14122 pmulhrsw m4, m3
14123 packuswb m4, m4
14124 movh [r0 + 324 * 16], m4
14125
14126 ; mode 22 [row 5 - first half]
14127 pmaddubsw m4, m1, [r5 + 18 * 16]
14128 pmulhrsw m4, m3
14129 packuswb m4, m4
14130 movh [r0 + 325 * 16], m4
14131
14132 ; mode 22 [row 6 - first half]
14133 pmaddubsw m4, m1, [r5 + 5 * 16]
14134 pmulhrsw m4, m3
14135 packuswb m4, m4
14136 movh [r0 + 326 * 16], m4
14137
14138 ; mode 22 [row 7 - first half]
14139 pslldq m1, 2
14140 pinsrb m1, [r4 + 5], 1
14141 pinsrb m1, [r4 + 7], 0
14142 pmaddubsw m4, m1, [r5 + 24 * 16]
14143 pmulhrsw m4, m3
14144 packuswb m4, m4
14145 movh [r0 + 327 * 16], m4
14146
14147 ; mode 22 [row 8 - first half]
14148 pmaddubsw m4, m1, [r5 + 11 * 16]
14149 pmulhrsw m4, m3
14150 packuswb m4, m4
14151 movh [r0 + 328 * 16], m4
14152
14153 ; mode 22 [row 9 - first half]
14154 pslldq m1, 2
14155 pinsrb m1, [r4 + 7], 1
14156 pinsrb m1, [r4 + 10], 0
14157 pmaddubsw m4, m1, [r5 + 30 * 16]
14158 pmulhrsw m4, m3
14159 packuswb m4, m4
14160 movh [r0 + 329 * 16], m4
14161
14162 ; mode 22 [row 10 - first half]
14163 pmaddubsw m4, m1, [r5 + 17 * 16]
14164 pmulhrsw m4, m3
14165 packuswb m4, m4
14166 movh [r0 + 330 * 16], m4
14167
14168 ; mode 22 [row 11 - first half]
14169 pmaddubsw m4, m1, [r5 + 4 * 16]
14170 pmulhrsw m4, m3
14171 packuswb m4, m4
14172 movh [r0 + 331 * 16], m4
14173
14174 ; mode 22 [row 12 - first half]
14175 pslldq m1, 2
14176 pinsrb m1, [r4 + 10], 1
14177 pinsrb m1, [r4 + 12], 0
14178 pmaddubsw m4, m1, [r5 + 23 * 16]
14179 pmulhrsw m4, m3
14180 packuswb m4, m4
14181 movh [r0 + 332 * 16], m4
14182
14183 ; mode 22 [row 13 - first half]
14184 pmaddubsw m4, m1, [r5 + 10 * 16]
14185 pmulhrsw m4, m3
14186 packuswb m4, m4
14187 movh [r0 + 333 * 16], m4
14188
14189 ; mode 22 [row 14 - first half]
14190 pslldq m1, 2
14191 pinsrb m1, [r4 + 12], 1
14192 pinsrb m1, [r4 + 15], 0
14193 pmaddubsw m4, m1, [r5 + 29 * 16]
14194 pmulhrsw m4, m3
14195 packuswb m4, m4
14196 movh [r0 + 334 * 16], m4
14197
14198 ; mode 22 [row 15 - first half]
14199 pmaddubsw m4, m1, [r5 + 16 * 16]
14200 pmulhrsw m4, m3
14201 packuswb m4, m4
14202 movh [r0 + 335 * 16], m4
14203
14204 ; mode 21 [row 3 - first half]
14205 pslldq m6, m7, 2
14206 pinsrb m6, [r4 + 2], 1
14207 pinsrb m6, [r4 + 4], 0
14208 pmaddubsw m4, m6, [r5 + 28 * 16]
14209 pmulhrsw m4, m3
14210 packuswb m4, m4
14211 movh [r0 + 307 * 16], m4
14212
14213 ; mode 21 [row 4 - first half]
14214 pmaddubsw m4, m6, [r5 + 11 * 16]
14215 pmulhrsw m4, m3
14216 packuswb m4, m4
14217 movh [r0 + 308 * 16], m4
14218
14219 ; mode 21 [row 5 - first half]
14220 pslldq m6, 2
14221 pinsrb m6, [r4 + 4], 1
14222 pinsrb m6, [r4 + 6], 0
14223 pmaddubsw m4, m6, [r5 + 26 * 16]
14224 pmulhrsw m4, m3
14225 packuswb m4, m4
14226 movh [r0 + 309 * 16], m4
14227
14228 ; mode 21 [row 6 - first half]
14229 pmaddubsw m4, m6, [r5 + 9 * 16]
14230 pmulhrsw m4, m3
14231 packuswb m4, m4
14232 movh [r0 + 310 * 16], m4
14233
14234 ; mode 21 [row 7 - first half]
14235 pslldq m6, 2
14236 pinsrb m6, [r4 + 6], 1
14237 pinsrb m6, [r4 + 8], 0
14238 pmaddubsw m4, m6, [r5 + 24 * 16]
14239 pmulhrsw m4, m3
14240 packuswb m4, m4
14241 movh [r0 + 311 * 16], m4
14242
14243 ; mode 21 [row 8 - first half]
14244 pmaddubsw m4, m6, [r5 + 7 * 16]
14245 pmulhrsw m4, m3
14246 packuswb m4, m4
14247 movh [r0 + 312 * 16], m4
14248
14249 ; mode 21 [row 9 - first half]
14250 pslldq m6, 2
14251 pinsrb m6, [r4 + 8], 1
14252 pinsrb m6, [r4 + 9], 0
14253 pmaddubsw m4, m6, [r5 + 22 * 16]
14254 pmulhrsw m4, m3
14255 packuswb m4, m4
14256 movh [r0 + 313 * 16], m4
14257
14258 ; mode 21 [row 10 - first half]
14259 pmaddubsw m4, m6, [r5 + 5 * 16]
14260 pmulhrsw m4, m3
14261 packuswb m4, m4
14262 movh [r0 + 314 * 16], m4
14263
14264 ; mode 21 [row 11 - first half]
14265 pslldq m6, 2
14266 pinsrb m6, [r4 + 9], 1
14267 pinsrb m6, [r4 + 11], 0
14268 pmaddubsw m4, m6, [r5 + 20 * 16]
14269 pmulhrsw m4, m3
14270 packuswb m4, m4
14271 movh [r0 + 315 * 16], m4
14272
14273 ; mode 21 [row 12 - first half]
14274 pmaddubsw m4, m6, [r5 + 3 * 16]
14275 pmulhrsw m4, m3
14276 packuswb m4, m4
14277 movh [r0 + 316 * 16], m4
14278
14279 ; mode 21 [row 13 - first half]
14280 pslldq m6, 2
14281 pinsrb m6, [r4 + 11], 1
14282 pinsrb m6, [r4 + 13], 0
14283 pmaddubsw m4, m6, [r5 + 18 * 16]
14284 pmulhrsw m4, m3
14285 packuswb m4, m4
14286 movh [r0 + 317 * 16], m4
14287
14288 ; mode 21 [row 14 - first half]
14289 pmaddubsw m4, m6, [r5 + 1 * 16]
14290 pmulhrsw m4, m3
14291 packuswb m4, m4
14292 movh [r0 + 318 * 16], m4
14293
14294 ; mode 21 [row 15 - first half]
14295 pslldq m6, 2
14296 pinsrb m6, [r4 + 13], 1
14297 pinsrb m6, [r4 + 15], 0
14298 pmaddubsw m4, m6, [r5 + 16 * 16]
14299 pmulhrsw m4, m3
14300 packuswb m4, m4
14301 movh [r0 + 319 * 16], m4
14302
14303 ; mode 20 [row 13 - second half]
14304 pmaddubsw m4, m7, [r5 + 26 * 16]
14305 pmulhrsw m4, m3
14306 packuswb m4, m4
14307 movh [r0 + 301 * 16 + 8], m4
14308 ; mode 20 [row 13 - second half] end
14309
14310 ; mode 20 [row 14 - second half]
14311 pmaddubsw m4, m7, [r5 + 5 * 16]
14312 pmulhrsw m4, m3
14313 packuswb m4, m4
14314 movh [r0 + 302 * 16 + 8], m4
14315 ; mode 20 [row 14 - second half] end
14316
14317 ; mode 20 [row 3 - first half]
14318 pslldq m7, 2
14319 pinsrb m7, [r4 + 2], 1
14320 pinsrb m7, [r4 + 3], 0
14321 pmaddubsw m4, m7, [r5 + 12 * 16]
14322 pmulhrsw m4, m3
14323 packuswb m4, m4
14324 movh [r0 + 291 * 16], m4
14325
14326 ; mode 20 [row 15 - second half]
14327 pmaddubsw m4, m7, [r5 + 16 * 16]
14328 pmulhrsw m4, m3
14329 packuswb m4, m4
14330 movh [r0 + 303 * 16 + 8], m4
14331 ; mode 20 [row 15 - second half] end
14332
14333 ; mode 20 [row 4 - first half]
14334 pslldq m7, 2
14335 pinsrb m7, [r4 + 3], 1
14336 pinsrb m7, [r4 + 5], 0
14337 pmaddubsw m4, m7, [r5 + 23 * 16]
14338 pmulhrsw m4, m3
14339 packuswb m4, m4
14340 movh [r0 + 292 * 16], m4
14341
14342 ; mode 20 [row 5 - first half]
14343 pmaddubsw m4, m7, [r5 + 2 * 16]
14344 pmulhrsw m4, m3
14345 packuswb m4, m4
14346 movh [r0 + 293 * 16], m4
14347
14348 ; mode 20 [row 6 - first half]
14349 pslldq m7, 2
14350 pinsrb m7, [r4 + 5], 1
14351 pinsrb m7, [r4 + 6], 0
14352 pmaddubsw m4, m7, [r5 + 13 * 16]
14353 pmulhrsw m4, m3
14354 packuswb m4, m4
14355 movh [r0 + 294 * 16], m4
14356
14357 ; mode 20 [row 7 - first half]
14358 pslldq m7, 2
14359 pinsrb m7, [r4 + 6], 1
14360 pinsrb m7, [r4 + 8], 0
14361 pmaddubsw m4, m7, [r5 + 24 * 16]
14362 pmulhrsw m4, m3
14363 packuswb m4, m4
14364 movh [r0 + 295 * 16], m4
14365
14366 ; mode 20 [row 8 - first half]
14367 pmaddubsw m4, m7, [r5 + 3 * 16]
14368 pmulhrsw m4, m3
14369 packuswb m4, m4
14370 movh [r0 + 296 * 16], m4
14371
14372 ; mode 20 [row 9 - first half]
14373 pslldq m7, 2
14374 pinsrb m7, [r4 + 8], 1
14375 pinsrb m7, [r4 + 9], 0
14376 pmaddubsw m4, m7, [r5 + 14 * 16]
14377 pmulhrsw m4, m3
14378 packuswb m4, m4
14379 movh [r0 + 297 * 16], m4
14380
14381 ; mode 20 [row 10 - first half]
14382 pslldq m7, 2
14383 pinsrb m7, [r4 + 9], 1
14384 pinsrb m7, [r4 + 11], 0
14385 pmaddubsw m4, m7, [r5 + 25 * 16]
14386 pmulhrsw m4, m3
14387 packuswb m4, m4
14388 movh [r0 + 298 * 16], m4
14389
14390 ; mode 20 [row 11 - first half]
14391 pmaddubsw m4, m7, [r5 + 4 * 16]
14392 pmulhrsw m4, m3
14393 packuswb m4, m4
14394 movh [r0 + 299 * 16], m4
14395
14396 ; mode 20 [row 12 - first half]
14397 movu m1, [r5 + 15 * 16]
14398 pslldq m7, 2
14399 pinsrb m7, [r4 + 11], 1
14400 pinsrb m7, [r4 + 12], 0
14401 pmaddubsw m4, m7, [r5 + 15 * 16]
14402 pmulhrsw m4, m3
14403 packuswb m4, m4
14404 movh [r0 + 300 * 16], m4
14405
14406 ; mode 20 [row 13 - first half]
14407 pslldq m7, 2
14408 pinsrb m7, [r4 + 12], 1
14409 pinsrb m7, [r4 + 14], 0
14410 pmaddubsw m4, m7, [r5 + 26 * 16]
14411 pmulhrsw m4, m3
14412 packuswb m4, m4
14413 movh [r0 + 301 * 16], m4
14414
14415 ; mode 20 [row 14 - first half]
14416 pmaddubsw m4, m7, [r5 + 5 * 16]
14417 pmulhrsw m4, m3
14418 packuswb m4, m4
14419 movh [r0 + 302 * 16], m4
14420
14421 ; mode 20 [row 15 - first half]
14422 pslldq m7, 2
14423 pinsrb m7, [r4 + 14], 1
14424 pinsrb m7, [r4 + 15], 0
14425 pmaddubsw m4, m7, [r5 + 16 * 16]
14426 pmulhrsw m4, m3
14427 packuswb m4, m4
14428 movh [r0 + 303 * 16], m4
14429
14430 ; mode 19 [row 1]
14431 pslldq m0, 2
14432 pinsrb m0, [r4 + 0], 1
14433 pinsrb m0, [r4 + 1], 0
14434 pslldq m5, 2
14435 pinsrb m5, [r3 + 8], 1
14436 pinsrb m5, [r3 + 7], 0
14437
14438 ; mode 20 [row 1 - second half]
14439 pmaddubsw m4, m5, [r5 + 22 * 16]
14440 pmulhrsw m4, m3
14441 packuswb m4, m4
14442 movh [r0 + 289 * 16 + 8], m4
14443 ; mode 20 [row 1 - second half] end
14444
14445 ; mode 20 [row 2 - second half]
14446 pmaddubsw m4, m5, [r5 + 1 * 16]
14447 pmulhrsw m4, m3
14448 packuswb m4, m4
14449 movh [r0 + 290 * 16 + 8], m4
14450 ; mode 20 [row 2 - second half] end
14451
14452 ; mode 21 [row 1 - second half]
14453 pmaddubsw m4, m5, [r5 + 30 * 16]
14454 pmulhrsw m4, m3
14455 packuswb m4, m4
14456 movh [r0 + 305 * 16 + 8], m4
14457 ; mode 21 [row 1 - second half] end
14458
14459 ; mode 21 [row 2 - second half]
14460 pmaddubsw m4, m5, [r5 + 13 * 16]
14461 pmulhrsw m4, m3
14462 packuswb m4, m4
14463 movh [r0 + 306 * 16 + 8], m4
14464 ; mode 21 [row 2 - second half] end
14465
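14466 ; mode 21 [row 4 - second half] NOTE(review): uses the row 4 weights but stores to row 3's slot (307 * 16 + 8); the later "mode 21 [row 3 - second half]" block overwrites that slot, so this store looks redundant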
14467 pmaddubsw m4, m5, [r5 + 11 * 16]
14468 pmulhrsw m4, m3
14469 packuswb m4, m4
14470 movh [r0 + 307 * 16 + 8], m4
14471 ; mode 21 [row 4 - second half] end
14472
14473 ; mode 22 [row 2 - second half]
14474 pmaddubsw m4, m5, [r5 + 25 * 16]
14475 pmulhrsw m4, m3
14476 packuswb m4, m4
14477 movh [r0 + 322 * 16 + 8], m4
14478 ; mode 22 [row 2 - second half] end
14479
14480 ; mode 22 [row 3 - second half]
14481 pmaddubsw m4, m5, [r5 + 12 * 16]
14482 pmulhrsw m4, m3
14483 packuswb m4, m4
14484 movh [r0 + 323 * 16 + 8], m4
14485 ; mode 22 [row 3 - second half] end
14486
14487 ; mode 23 [row 3 - second half]
14488 pmaddubsw m4, m5, [r5 + 28 * 16]
14489 pmulhrsw m4, m3
14490 packuswb m4, m4
14491 movh [r0 + 339 * 16 + 8], m4
14492 ; mode 23 [row 3 - second half] end
14493
14494 ; mode 23 [row 4 - second half]
14495 pmaddubsw m4, m5, [r5 + 19 * 16]
14496 pmulhrsw m4, m3
14497 packuswb m4, m4
14498 movh [r0 + 340 * 16 + 8], m4
14499 ; mode 23 [row 4 - second half] end
14500
14501 ; mode 23 [row 5 - second half]
14502 pmaddubsw m4, m5, [r5 + 10 * 16]
14503 pmulhrsw m4, m3
14504 packuswb m4, m4
14505 movh [r0 + 341 * 16 + 8], m4
14506 ; mode 23 [row 5 - second half] end
14507
14508 ; mode 23 [row 6 - second half]
14509 pmaddubsw m4, m5, [r5 + 1 * 16]
14510 pmulhrsw m4, m3
14511 packuswb m4, m4
14512 movh [r0 + 342 * 16 + 8], m4
14513 ; mode 23 [row 6 - second half] end
14514
14515 ; mode 24 [row 6 - second half]
14516 pmaddubsw m4, m5, [r5 + 29 * 16]
14517 pmulhrsw m4, m3
14518 packuswb m4, m4
14519 movh [r0 + 358 * 16 + 8], m4
14520 ; mode 24 [row 6 - second half] end
14521
14522 ; mode 24 [row 7 - second half]
14523 pmaddubsw m4, m5, [r5 + 24 * 16]
14524 pmulhrsw m4, m3
14525 packuswb m4, m4
14526 movh [r0 + 359 * 16 + 8], m4
14527 ; mode 24 [row 7 - second half] end
14528
14529 ; mode 24 [row 8 - second half]
14530 pmaddubsw m4, m5, [r5 + 19 * 16]
14531 pmulhrsw m4, m3
14532 packuswb m4, m4
14533 movh [r0 + 360 * 16 + 8], m4
14534 ; mode 24 [row 8 - second half] end
14535
14536 ; mode 24 [row 9 - second half]
14537 pmaddubsw m4, m5, [r5 + 14 * 16]
14538 pmulhrsw m4, m3
14539 packuswb m4, m4
14540 movh [r0 + 361 * 16 + 8], m4
14541 ; mode 24 [row 9 - second half] end
14542
14543 ; mode 24 [row 10 - second half]
14544 pmaddubsw m4, m5, [r5 + 9 * 16]
14545 pmulhrsw m4, m3
14546 packuswb m4, m4
14547 movh [r0 + 362 * 16 + 8], m4
14548 ; mode 24 [row 10 - second half] end
14549
14550 ; mode 24 [row 11 - second half]
14551 pmaddubsw m4, m5, [r5 + 4 * 16]
14552 pmulhrsw m4, m3
14553 packuswb m4, m4
14554 movh [r0 + 363 * 16 + 8], m4
14555 ; mode 24 [row 11 - second half] end
14556
14557 pmaddubsw m4, m0, [r5 + 12 * 16]
14558 pmulhrsw m4, m3
14559 pmaddubsw m6, m5, [r5 + 12 * 16]
14560 pmulhrsw m6, m3
14561 packuswb m4, m6
14562 movu [r0 + 273 * 16], m4
14563
14564 ; mode 19 [row 2]
14565 pslldq m0, 2
14566 pinsrb m0, [r4 + 1], 1
14567 pinsrb m0, [r4 + 2], 0
14568 pslldq m5, 2
14569 pinsrb m5, [r3 + 7], 1
14570 pinsrb m5, [r3 + 6], 0
14571
14572 ; mode 20 [row 3 - second half]
14573 pmaddubsw m4, m5, [r5 + 12 * 16]
14574 pmulhrsw m4, m3
14575 packuswb m4, m4
14576 movh [r0 + 291 * 16 + 8], m4
14577 ; mode 20 [row 3 - second half] end
14578
14579 ; mode 21 [row 3 - second half]
14580 pmaddubsw m4, m5, [r5 + 28 * 16]
14581 pmulhrsw m4, m3
14582 packuswb m4, m4
14583 movh [r0 + 307 * 16 + 8], m4
14584 ; mode 21 [row 3 - second half] end
14585
14586 ; mode 21 [row 4 - second half]
14587 pmaddubsw m4, m5, [r5 + 11 * 16]
14588 pmulhrsw m4, m3
14589 packuswb m4, m4
14590 movh [r0 + 308 * 16 + 8], m4
14591 ; mode 21 [row 4 - second half] end
14592
14593 ; mode 22 [row 4 - second half]
14594 pmaddubsw m4, m5, [r5 + 31 * 16]
14595 pmulhrsw m4, m3
14596 packuswb m4, m4
14597 movh [r0 + 324 * 16 + 8], m4
14598 ; mode 22 [row 4 - second half] end
14599
14600 ; mode 22 [row 5 - second half]
14601 pmaddubsw m4, m5, [r5 + 18 * 16]
14602 pmulhrsw m4, m3
14603 packuswb m4, m4
14604 movh [r0 + 325 * 16 + 8], m4
14605 ; mode 22 [row 5 - second half] end
14606
14607 ; mode 22 [row 6 - second half]
14608 pmaddubsw m4, m5, [r5 + 5 * 16]
14609 pmulhrsw m4, m3
14610 packuswb m4, m4
14611 movh [r0 + 326 * 16 + 8], m4
14612 ; mode 22 [row 6 - second half] end
14613
14614 ; mode 23 [row 7 - second half]
14615 pmaddubsw m4, m5, [r5 + 24 * 16]
14616 pmulhrsw m4, m3
14617 packuswb m4, m4
14618 movh [r0 + 343 * 16 + 8], m4
14619 ; mode 23 [row 7 - second half] end
14620
14621 ; mode 23 [row 8 - second half]
14622 pmaddubsw m4, m5, [r5 + 15 * 16]
14623 pmulhrsw m4, m3
14624 packuswb m4, m4
14625 movh [r0 + 344 * 16 + 8], m4
14626 ; mode 23 [row 8 - second half] end
14627
14628 ; mode 23 [row 9 - second half]
14629 pmaddubsw m4, m5, [r5 + 6 * 16]
14630 pmulhrsw m4, m3
14631 packuswb m4, m4
14632 movh [r0 + 345 * 16 + 8], m4
14633 ; mode 23 [row 9 - second half] end
14634
14635 ; mode 24 [row 12 - second half]
14636 pmaddubsw m4, m5, [r5 + 31 * 16]
14637 pmulhrsw m4, m3
14638 packuswb m4, m4
14639 movh [r0 + 364 * 16 + 8], m4
14640 ; mode 24 [row 12 - second half] end
14641
14642 ; mode 24 [row 13 - second half]
14643 pmaddubsw m4, m5, [r5 + 26 * 16]
14644 pmulhrsw m4, m3
14645 packuswb m4, m4
14646 movh [r0 + 365 * 16 + 8], m4
14647 ; mode 24 [row 13 - second half] end
14648
14649 ; mode 24 [row 14 - second half]
14650 pmaddubsw m4, m5, [r5 + 21 * 16]
14651 pmulhrsw m4, m3
14652 packuswb m4, m4
14653 movh [r0 + 366 * 16 + 8], m4
14654 ; mode 24 [row 14 - second half] end
14655
14656 ; mode 24 [row 15 - second half]
14657 pmaddubsw m4, m5, [r5 + 16 * 16]
14658 pmulhrsw m4, m3
14659 packuswb m4, m4
14660 movh [r0 + 367 * 16 + 8], m4
14661 ; mode 24 [row 15 - second half] end
14662
14663 pmaddubsw m4, m0, [r5 + 18 * 16]
14664 pmulhrsw m4, m3
14665 pmaddubsw m6, m5, [r5 + 18 * 16]
14666 pmulhrsw m6, m3
14667 packuswb m4, m6
14668 movu [r0 + 274 * 16], m4
14669
14670 ; mode 19 [row 3]
14671 pslldq m0, 2
14672 pinsrb m0, [r4 + 2], 1
14673 pinsrb m0, [r4 + 4], 0
14674 pslldq m5, 2
14675 pinsrb m5, [r3 + 6], 1
14676 pinsrb m5, [r3 + 5], 0
14677
14678 ; mode 20 [row 4 - second half]
14679 pmaddubsw m4, m5, [r5 + 23 * 16]
14680 pmulhrsw m4, m3
14681 packuswb m4, m4
14682 movh [r0 + 292 * 16 + 8], m4
14683 ; mode 20 [row 4 - second half] end
14684
14685 ; mode 20 [row 5 - second half]
14686 pmaddubsw m4, m5, [r5 + 2 * 16]
14687 pmulhrsw m4, m3
14688 packuswb m4, m4
14689 movh [r0 + 293 * 16 + 8], m4
14690 ; mode 20 [row 5 - second half] end
14691
14692 ; mode 21 [row 5 - second half]
14693 pmaddubsw m4, m5, [r5 + 26 * 16]
14694 pmulhrsw m4, m3
14695 packuswb m4, m4
14696 movh [r0 + 309 * 16 + 8], m4
14697 ; mode 21 [row 5 - second half] end
14698
14699 ; mode 21 [row 6 - second half]
14700 pmaddubsw m4, m5, [r5 + 9 * 16]
14701 pmulhrsw m4, m3
14702 packuswb m4, m4
14703 movh [r0 + 310 * 16 + 8], m4
14704 ; mode 21 [row 6 - second half] end
14705
14706 ; mode 22 [row 7 - second half]
14707 pmaddubsw m4, m5, [r5 + 24 * 16]
14708 pmulhrsw m4, m3
14709 packuswb m4, m4
14710 movh [r0 + 327 * 16 + 8], m4
14711 ; mode 22 [row 7 - second half] end
14712
14713 ; mode 22 [row 8 - second half]
14714 pmaddubsw m4, m5, [r5 + 11 * 16]
14715 pmulhrsw m4, m3
14716 packuswb m4, m4
14717 movh [r0 + 328 * 16 + 8], m4
14718 ; mode 22 [row 8 - second half] end
14719
14720 ; mode 23 [row 10 - second half]
14721 pmaddubsw m4, m5, [r5 + 29 * 16]
14722 pmulhrsw m4, m3
14723 packuswb m4, m4
14724 movh [r0 + 346 * 16 + 8], m4
14725 ; mode 23 [row 10 - second half] end
14726
14727 ; mode 23 [row 11 - second half]
14728 pmaddubsw m4, m5, [r5 + 20 * 16]
14729 pmulhrsw m4, m3
14730 packuswb m4, m4
14731 movh [r0 + 347 * 16 + 8], m4
14732 ; mode 23 [row 11 - second half] end
14733
14734 ; mode 23 [row 12 - second half]
14735 pmaddubsw m4, m5, [r5 + 11 * 16]
14736 pmulhrsw m4, m3
14737 packuswb m4, m4
14738 movh [r0 + 348 * 16 + 8], m4
14739 ; mode 23 [row 12 - second half] end
14740
14741 ; mode 23 [row 13 - second half]
14742 pmaddubsw m4, m5, [r5 + 2 * 16]
14743 pmulhrsw m4, m3
14744 packuswb m4, m4
14745 movh [r0 + 349 * 16 + 8], m4
14746 ; mode 23 [row 13 - second half] end
14747
14748 pmaddubsw m4, m0, [r5 + 24 * 16]
14749 pmulhrsw m4, m3
14750 pmaddubsw m6, m5, [r5 + 24 * 16]
14751 pmulhrsw m6, m3
14752 packuswb m4, m6
14753 movu [r0 + 275 * 16], m4
14754
14755 ; mode 19 [row 4]
14756 pslldq m0, 2
14757 pinsrb m0, [r4 + 4], 1
14758 pinsrb m0, [r4 + 5], 0
14759 pslldq m5, 2
14760 pinsrb m5, [r3 + 5], 1
14761 pinsrb m5, [r3 + 4], 0
14762
14763 ; mode 20 [row 6 - second half]
14764 pmaddubsw m4, m5, [r5 + 13 * 16]
14765 pmulhrsw m4, m3
14766 packuswb m4, m4
14767 movh [r0 + 294 * 16 + 8], m4
14768 ; mode 20 [row 6 - second half] end
14769
14770 ; mode 21 [row 7 - second half]
14771 pmaddubsw m4, m5, [r5 + 24 * 16]
14772 pmulhrsw m4, m3
14773 packuswb m4, m4
14774 movh [r0 + 311 * 16 + 8], m4
14775 ; mode 21 [row 7 - second half] end
14776
14777 ; mode 21 [row 8 - second half]
14778 pmaddubsw m4, m5, [r5 + 7 * 16]
14779 pmulhrsw m4, m3
14780 packuswb m4, m4
14781 movh [r0 + 312 * 16 + 8], m4
14782 ; mode 21 [row 8 - second half] end
14783
14784 ; mode 22 [row 9 - second half]
14785 pmaddubsw m4, m5, [r5 + 30 * 16]
14786 pmulhrsw m4, m3
14787 packuswb m4, m4
14788 movh [r0 + 329 * 16 + 8], m4
14789 ; mode 22 [row 9 - second half] end
14790
14791 ; mode 22 [row 10 - second half]
14792 pmaddubsw m4, m5, [r5 + 17 * 16]
14793 pmulhrsw m4, m3
14794 packuswb m4, m4
14795 movh [r0 + 330 * 16 + 8], m4
14796 ; mode 22 [row 10 - second half] end
14797
14798 ; mode 22 [row 11 - second half]
14799 pmaddubsw m4, m5, [r5 + 4 * 16]
14800 pmulhrsw m4, m3
14801 packuswb m4, m4
14802 movh [r0 + 331 * 16 + 8], m4
14803 ; mode 22 [row 11 - second half] end
14804
14805 ; mode 23 [row 14 - second half]
14806 pmaddubsw m4, m5, [r5 + 25 * 16]
14807 pmulhrsw m4, m3
14808 packuswb m4, m4
14809 movh [r0 + 350 * 16 + 8], m4
14810 ; mode 23 [row 14 - second half] end
14811
14812 ; mode 23 [row 15 - second half]
14813 pmaddubsw m4, m5, [r5 + 16 * 16]
14814 pmulhrsw m4, m3
14815 packuswb m4, m4
14816 movh [r0 + 351 * 16 + 8], m4
14817
14818 ; mode 23 [row 15 - second half] end
14819 pmaddubsw m4, m0, [r5 + 30 * 16]
14820 pmulhrsw m4, m3
14821 pmaddubsw m6, m5, [r5 + 30 * 16]
14822 pmulhrsw m6, m3
14823 packuswb m4, m6
14824 movu [r0 + 276 * 16], m4
14825
14826 ; mode 19 [row 5]
14827 pmaddubsw m4, m0, [r5 + 4 * 16]
14828 pmulhrsw m4, m3
14829 pmaddubsw m6, m5, [r5 + 4 * 16]
14830 pmulhrsw m6, m3
14831 packuswb m4, m6
14832 movu [r0 + 277 * 16], m4
14833
14834 ; mode 19 [row 6]
14835 pslldq m0, 2
14836 pinsrb m0, [r4 + 5], 1
14837 pinsrb m0, [r4 + 6], 0
14838 pslldq m5, 2
14839 pinsrb m5, [r3 + 4], 1
14840 pinsrb m5, [r3 + 3], 0
14841
14842 ; mode 20 [row 7 - second half]
14843 pmaddubsw m4, m5, [r5 + 24 * 16]
14844 pmulhrsw m4, m3
14845 packuswb m4, m4
14846 movh [r0 + 295 * 16 + 8], m4
14847 ; mode 20 [row 7 - second half] end
14848
14849 ; mode 20 [row 8 - second half]
14850 pmaddubsw m4, m5, [r5 + 3 * 16]
14851 pmulhrsw m4, m3
14852 packuswb m4, m4
14853 movh [r0 + 296 * 16 + 8], m4
14854 ; mode 20 [row 8 - second half] end
14855
14856 ; mode 21 [row 9 - second half]
14857 pmaddubsw m4, m5, [r5 + 22 * 16]
14858 pmulhrsw m4, m3
14859 packuswb m4, m4
14860 movh [r0 + 313 * 16 + 8], m4
14861 ; mode 21 [row 9 - second half] end
14862
14863 ; mode 21 [row 10 - second half]
14864 pmaddubsw m4, m5, [r5 + 5 * 16]
14865 pmulhrsw m4, m3
14866 packuswb m4, m4
14867 movh [r0 + 314 * 16 + 8], m4
14868 ; mode 21 [row 10 - second half] end
14869
14870 ; mode 22 [row 12 - second half]
14871 pmaddubsw m4, m5, [r5 + 23 * 16]
14872 pmulhrsw m4, m3
14873 packuswb m4, m4
14874 movh [r0 + 332 * 16 + 8], m4
14875 ; mode 22 [row 12 - second half] end
14876
14877 ; mode 22 [row 13 - second half]
14878 pmaddubsw m4, m5, [r5 + 10 * 16]
14879 pmulhrsw m4, m3
14880 packuswb m4, m4
14881 movh [r0 + 333 * 16 + 8], m4
14882 ; mode 22 [row 13 - second half] end
14883
14884 pmaddubsw m4, m0, [r5 + 10 * 16]
14885 pmulhrsw m4, m3
14886 pmaddubsw m6, m5, [r5 + 10 * 16]
14887 pmulhrsw m6, m3
14888 packuswb m4, m6
14889 movu [r0 + 278 * 16], m4
14890
14891 ; mode 19 [row 7]
14892 pslldq m0, 2
14893 pinsrb m0, [r4 + 6], 1
14894 pinsrb m0, [r4 + 7], 0
14895 pslldq m5, 2
14896 pinsrb m5, [r3 + 3], 1
14897 pinsrb m5, [r3 + 2], 0
14898
14899 ; mode 20 [row 9 - second half]
14900 pmaddubsw m4, m5, [r5 + 14 * 16]
14901 pmulhrsw m4, m3
14902 packuswb m4, m4
14903 movh [r0 + 297 * 16 + 8], m4
14904 ; mode 20 [row 9 - second half] end
14905
14906 ; mode 21 [row 11 - second half]
14907 pmaddubsw m4, m5, [r5 + 20 * 16]
14908 pmulhrsw m4, m3
14909 packuswb m4, m4
14910 movh [r0 + 315 * 16 + 8], m4
14911 ; mode 21 [row 11 - second half] end
14912
14913 ; mode 21 [row 12 - second half]
14914 pmaddubsw m4, m5, [r5 + 3 * 16]
14915 pmulhrsw m4, m3
14916 packuswb m4, m4
14917 movh [r0 + 316 * 16 + 8], m4
14918 ; mode 21 [row 12 - second half] end
14919
14920 ; mode 22 [row 14 - second half]
14921 pmaddubsw m4, m5, [r5 + 29 * 16]
14922 pmulhrsw m4, m3
14923 packuswb m4, m4
14924 movh [r0 + 334 * 16 + 8], m4
14925 ; mode 22 [row 14 - second half] end
14926
14927 ; mode 22 [row 15 - second half]
14928 pmaddubsw m4, m5, [r5 + 16 * 16]
14929 pmulhrsw m4, m3
14930 packuswb m4, m4
14931 movh [r0 + 335 * 16 + 8], m4
14932 ; mode 22 [row 15 - second half] end
14933
14934 pmaddubsw m4, m0, [r5 + 16 * 16]
14935 pmulhrsw m4, m3
14936 pmaddubsw m6, m5, [r5 + 16 * 16]
14937 pmulhrsw m6, m3
14938 packuswb m4, m6
14939 movu [r0 + 279 * 16], m4
14940
14941 ; mode 19 [row 8]
14942 pslldq m0, 2
14943 pinsrb m0, [r4 + 7], 1
14944 pinsrb m0, [r4 + 9], 0
14945 pslldq m5, 2
14946 pinsrb m5, [r3 + 2], 1
14947 pinsrb m5, [r3 + 1], 0
14948
14949 ; mode 20 [row 10 - second half]
14950 pmaddubsw m4, m5, [r5 + 25 * 16]
14951 pmulhrsw m4, m3
14952 packuswb m4, m4
14953 movh [r0 + 298 * 16 + 8], m4
14954 ; mode 20 [row 10 - second half] end
14955
14956 ; mode 20 [row 11 - second half]
14957 pmaddubsw m4, m5, [r5 + 4 * 16]
14958 pmulhrsw m4, m3
14959 packuswb m4, m4
14960 movh [r0 + 299 * 16 + 8], m4
14961 ; mode 20 [row 11 - second half] end
14962
14963 ; mode 21 [row 13 - second half]
14964 pmaddubsw m4, m5, [r5 + 18 * 16]
14965 pmulhrsw m4, m3
14966 packuswb m4, m4
14967 movh [r0 + 317 * 16 + 8], m4
14968 ; mode 21 [row 13 - second half] end
14969
14970 ; mode 21 [row 14 - second half]
14971 pmaddubsw m4, m5, [r5 + 1 * 16]
14972 pmulhrsw m4, m3
14973 packuswb m4, m4
14974 movh [r0 + 318 * 16 + 8], m4
14975 ; mode 21 [row 14 - second half] end
14976
14977 pmaddubsw m4, m0, [r5 + 22 * 16]
14978 pmulhrsw m4, m3
14979 pmaddubsw m6, m5, [r5 + 22 * 16]
14980 pmulhrsw m6, m3
14981 packuswb m4, m6
14982 movu [r0 + 280 * 16], m4
14983
14984 ; mode 19 [row 9]
14985 pslldq m0, 2
14986 pinsrb m0, [r4 + 9], 1
14987 pinsrb m0, [r4 + 10], 0
14988 pslldq m5, 2
14989 pinsrb m5, [r3 + 1], 1
14990 pinsrb m5, [r3 + 0], 0
14991
14992 ; mode 20 [row 12 - second half]
14993 pmaddubsw m4, m5, [r5 + 15 * 16]
14994 pmulhrsw m4, m3
14995 packuswb m4, m4
14996 movh [r0 + 300 * 16 + 8], m4
14997
14998 ; mode 20 [row 12 - second half] end
14999 pmaddubsw m4, m0, [r5 + 28 * 16]
15000 pmulhrsw m4, m3
15001 pmaddubsw m6, m5, [r5 + 28 * 16]
15002 pmulhrsw m6, m3
15003 packuswb m4, m6
15004 movu [r0 + 281 * 16], m4
15005
15006 ; mode 19 [row 10]
15007 pmaddubsw m4, m0, [r5 + 2 * 16]
15008 pmulhrsw m4, m3
15009 pmaddubsw m6, m5, [r5 + 2 * 16]
15010 pmulhrsw m6, m3
15011 packuswb m4, m6
15012 movu [r0 + 282 * 16], m4
15013
15014 ; mode 19 [row 11]
15015 pslldq m0, 2
15016 pinsrb m0, [r4 + 10], 1
15017 pinsrb m0, [r4 + 11], 0
15018 pmaddubsw m4, m0, [r5 + 8 * 16]
15019 pmulhrsw m4, m3
15020 pslldq m5, 2
15021 pinsrb m5, [r4 + 0], 1
15022 pinsrb m5, [r4 + 1], 0
15023 pmaddubsw m6, m5, [r5 + 8 * 16]
15024 pmulhrsw m6, m3
15025 packuswb m4, m6
15026 movu [r0 + 283 * 16], m4
15027
15028 ; mode 19 [row 12]
15029 pslldq m0, 2
15030 pinsrb m0, [r4 + 11], 1
15031 pinsrb m0, [r4 + 12], 0
15032 pslldq m5, 2
15033 pinsrb m5, [r4 + 1], 1
15034 pinsrb m5, [r4 + 2], 0
15035 pmaddubsw m4, m0, [r5 + 14 * 16]
15036 pmulhrsw m4, m3
15037 pmaddubsw m6, m5, [r5 + 14 * 16]
15038 pmulhrsw m6, m3
15039 packuswb m4, m6
15040 movu [r0 + 284 * 16], m4
15041
15042 ; mode 19 [row 13]
15043 pslldq m0, 2
15044 pinsrb m0, [r4 + 12], 1
15045 pinsrb m0, [r4 + 14], 0
15046 pmaddubsw m4, m0, [r5 + 20 * 16]
15047 pmulhrsw m4, m3
15048 pslldq m5, 2
15049 pinsrb m5, [r4 + 2], 1
15050 pinsrb m5, [r4 + 4], 0
15051 pmaddubsw m6, m5, [r5 + 20 * 16]
15052 pmulhrsw m6, m3
15053 packuswb m4, m6
15054 movu [r0 + 285 * 16], m4
15055
15056 ; mode 19 [row 14]
15057 pslldq m0, 2
15058 pinsrb m0, [r4 + 14], 1
15059 pinsrb m0, [r4 + 15], 0
15060 pmaddubsw m4, m0, [r5 + 26 * 16]
15061 pmulhrsw m4, m3
15062 pslldq m5, 2
15063 pinsrb m5, [r4 + 4], 1
15064 pinsrb m5, [r4 + 5], 0
15065 pmaddubsw m6, m5, [r5 + 26 * 16]
15066 pmulhrsw m6, m3
15067 packuswb m4, m6
15068 movu [r0 + 286 * 16], m4
15069
15070 ; mode 19 [row 15]
15071 movu m0, [r4]
15072 pshufb m0, [tab_S1]
15073 movu [r0 + 287 * 16], m0
15074 movd m1, [r3]
15075 movd [r0 + 287 * 16 + 12], m1
15076
15077 ; mode 25
15078 movu m1, [r1]
15079
15080 ; mode 26 [all rows]
15081 psrldq m6, m1, 1
15082 pinsrb m6, [r1 + 16], 15
15083 movu m7, m6
15084 movu [r0 + 384 * 16], m6
15085 movu [r0 + 385 * 16], m6
15086 movu [r0 + 386 * 16], m6
15087 movu [r0 + 387 * 16], m6
15088 movu [r0 + 388 * 16], m6
15089 movu [r0 + 389 * 16], m6
15090 movu [r0 + 390 * 16], m6
15091 movu [r0 + 391 * 16], m6
15092 movu [r0 + 392 * 16], m6
15093 movu [r0 + 393 * 16], m6
15094 movu [r0 + 394 * 16], m6
15095 movu [r0 + 395 * 16], m6
15096 movu [r0 + 396 * 16], m6
15097 movu [r0 + 397 * 16], m6
15098 movu [r0 + 398 * 16], m6
15099 movu [r0 + 399 * 16], m6
15100
15101 pxor m0, m0
15102 pshufb m6, m6, m0
15103 punpcklbw m6, m0
15104 movu m2, [r2]
15105 pshufb m2, m2, m0
15106 punpcklbw m2, m0
15107 movu m4, [r2 + 1]
15108 punpcklbw m5, m4, m0
15109 punpckhbw m4, m0
15110 psubw m5, m2
15111 psubw m4, m2
15112 psraw m5, 1
15113 psraw m4, 1
15114 paddw m5, m6
15115 paddw m4, m6
15116 packuswb m5, m4
15117
15118 pextrb [r0 + 384 * 16], m5, 0
15119 pextrb [r0 + 385 * 16], m5, 1
15120 pextrb [r0 + 386 * 16], m5, 2
15121 pextrb [r0 + 387 * 16], m5, 3
15122 pextrb [r0 + 388 * 16], m5, 4
15123 pextrb [r0 + 389 * 16], m5, 5
15124 pextrb [r0 + 390 * 16], m5, 6
15125 pextrb [r0 + 391 * 16], m5, 7
15126 pextrb [r0 + 392 * 16], m5, 8
15127 pextrb [r0 + 393 * 16], m5, 9
15128 pextrb [r0 + 394 * 16], m5, 10
15129 pextrb [r0 + 395 * 16], m5, 11
15130 pextrb [r0 + 396 * 16], m5, 12
15131 pextrb [r0 + 397 * 16], m5, 13
15132 pextrb [r0 + 398 * 16], m5, 14
15133 pextrb [r0 + 399 * 16], m5, 15
15134
15135 ; mode 25 [row 15]
15136 movu [r0 + 383 * 16], m1
15137
15138 ; mode 25 [row 0]
15139 psrldq m2, m1, 1
15140 punpcklbw m1, m2
15141 movu m2, [r1 + 8]
15142 psrldq m4, m2, 1
15143 punpcklbw m2, m4
15144 pmaddubsw m4, m1, [r5 + 30 * 16]
15145 pmulhrsw m4, m3
15146 pmaddubsw m5, m2, [r5 + 30 * 16]
15147 pmulhrsw m5, m3
15148 packuswb m4, m5
15149 movu [r0 + 368 * 16], m4
15150
15151 ; mode 25 [row 1]
15152 pmaddubsw m4, m1, [r5 + 28 * 16]
15153 pmulhrsw m4, m3
15154 pmaddubsw m5, m2, [r5 + 28 * 16]
15155 pmulhrsw m5, m3
15156 packuswb m4, m5
15157 movu [r0 + 369 * 16], m4
15158
15159 ; mode 25 [row 2]
15160 pmaddubsw m4, m1, [r5 + 26 * 16]
15161 pmulhrsw m4, m3
15162 pmaddubsw m5, m2, [r5 + 26 * 16]
15163 pmulhrsw m5, m3
15164 packuswb m4, m5
15165 movu [r0 + 370 * 16], m4
15166
15167 ; mode 25 [row 3]
15168 pmaddubsw m4, m1, [r5 + 24 * 16]
15169 pmulhrsw m4, m3
15170 pmaddubsw m5, m2, [r5 + 24 * 16]
15171 pmulhrsw m5, m3
15172 packuswb m4, m5
15173 movu [r0 + 371 * 16], m4
15174
15175 ; mode 25 [row 4]
15176 pmaddubsw m4, m1, [r5 + 22 * 16]
15177 pmulhrsw m4, m3
15178 pmaddubsw m5, m2, [r5 + 22 * 16]
15179 pmulhrsw m5, m3
15180 packuswb m4, m5
15181 movu [r0 + 372 * 16], m4
15182
15183 ; mode 25 [row 5]
15184 pmaddubsw m4, m1, [r5 + 20 * 16]
15185 pmulhrsw m4, m3
15186 pmaddubsw m5, m2, [r5 + 20 * 16]
15187 pmulhrsw m5, m3
15188 packuswb m4, m5
15189 movu [r0 + 373 * 16], m4
15190
15191 ; mode 25 [row 6]
15192 pmaddubsw m4, m1, [r5 + 18 * 16]
15193 pmulhrsw m4, m3
15194 pmaddubsw m5, m2, [r5 + 18 * 16]
15195 pmulhrsw m5, m3
15196 packuswb m4, m5
15197 movu [r0 + 374 * 16], m4
15198
15199 ; mode 25 [row 7]
15200 pmaddubsw m4, m1, [r5 + 16 * 16]
15201 pmulhrsw m4, m3
15202 pmaddubsw m5, m2, [r5 + 16 * 16]
15203 pmulhrsw m5, m3
15204 packuswb m4, m5
15205 movu [r0 + 375 * 16], m4
15206
15207 ; mode 25 [row 8]
15208 pmaddubsw m4, m1, [r5 + 14 * 16]
15209 pmulhrsw m4, m3
15210 pmaddubsw m5, m2, [r5 + 14 * 16]
15211 pmulhrsw m5, m3
15212 packuswb m4, m5
15213 movu [r0 + 376 * 16], m4
15214
15215 ; mode 25 [row 9]
15216 pmaddubsw m4, m1, [r5 + 12 * 16]
15217 pmulhrsw m4, m3
15218 pmaddubsw m5, m2, [r5 + 12 * 16]
15219 pmulhrsw m5, m3
15220 packuswb m4, m5
15221 movu [r0 + 377 * 16], m4
15222
15223 ; mode 25 [row 10]
15224 pmaddubsw m4, m1, [r5 + 10 * 16]
15225 pmulhrsw m4, m3
15226 pmaddubsw m5, m2, [r5 + 10 * 16]
15227 pmulhrsw m5, m3
15228 packuswb m4, m5
15229 movu [r0 + 378 * 16], m4
15230
15231 ; mode 25 [row 11]
15232 pmaddubsw m4, m1, [r5 + 8 * 16]
15233 pmulhrsw m4, m3
15234 pmaddubsw m5, m2, [r5 + 8 * 16]
15235 pmulhrsw m5, m3
15236 packuswb m4, m5
15237 movu [r0 + 379 * 16], m4
15238
15239 ; mode 25 [row 12]
15240 pmaddubsw m4, m1, [r5 + 6 * 16]
15241 pmulhrsw m4, m3
15242 pmaddubsw m5, m2, [r5 + 6 * 16]
15243 pmulhrsw m5, m3
15244 packuswb m4, m5
15245 movu [r0 + 380 * 16], m4
15246
15247 ; mode 25 [row 13]
15248 pmaddubsw m4, m1, [r5 + 4 * 16]
15249 pmulhrsw m4, m3
15250 pmaddubsw m5, m2, [r5 + 4 * 16]
15251 pmulhrsw m5, m3
15252 packuswb m4, m5
15253 movu [r0 + 381 * 16], m4
15254
15255 ; mode 25 [row 14]
15256 pmaddubsw m4, m1, [r5 + 2 * 16]
15257 pmulhrsw m4, m3
15258 pmaddubsw m5, m2, [r5 + 2 * 16]
15259 pmulhrsw m5, m3
15260 packuswb m4, m5
15261 movu [r0 + 382 * 16], m4
15262
15263 ; mode 27 [row 15]
15264 psrldq m6, m7, 1
15265 punpcklbw m7, m6
15266 pinsrb m6, [r1 + 17], 15
15267 movu [r0 + 415 * 16], m6
15268
15269 ; mode 27 [row 0]
15270 movu m4, [r1 + 9]
15271 psrldq m5, m4, 1
15272 punpcklbw m4, m5
15273 pmaddubsw m6, m7, [r5 + 2 * 16]
15274 pmulhrsw m6, m3
15275 pmaddubsw m5, m4, [r5 + 2 * 16]
15276 pmulhrsw m5, m3
15277 packuswb m6, m5
15278 movu [r0 + 400 * 16], m6
15279
15280 ; mode 27 [row 1]
15281 pmaddubsw m6, m7, [r5 + 4 * 16]
15282 pmulhrsw m6, m3
15283 pmaddubsw m5, m4, [r5 + 4 * 16]
15284 pmulhrsw m5, m3
15285 packuswb m6, m5
15286 movu [r0 + 401 * 16], m6
15287
15288 ; mode 27 [row 2]
15289 pmaddubsw m6, m7, [r5 + 6 * 16]
15290 pmulhrsw m6, m3
15291 pmaddubsw m5, m4, [r5 + 6 * 16]
15292 pmulhrsw m5, m3
15293 packuswb m6, m5
15294 movu [r0 + 402 * 16], m6
15295
15296 ; mode 27 [row 3]
15297 pmaddubsw m6, m7, [r5 + 8 * 16]
15298 pmulhrsw m6, m3
15299 pmaddubsw m5, m4, [r5 + 8 * 16]
15300 pmulhrsw m5, m3
15301 packuswb m6, m5
15302 movu [r0 + 403 * 16], m6
15303
15304 ; mode 27 [row 4]
15305 pmaddubsw m6, m7, [r5 + 10 * 16]
15306 pmulhrsw m6, m3
15307 pmaddubsw m5, m4, [r5 + 10 * 16]
15308 pmulhrsw m5, m3
15309 packuswb m6, m5
15310 movu [r0 + 404 * 16], m6
15311
15312 ; mode 27 [row 5]
15313 pmaddubsw m6, m7, [r5 + 12 * 16]
15314 pmulhrsw m6, m3
15315 pmaddubsw m5, m4, [r5 + 12 * 16]
15316 pmulhrsw m5, m3
15317 packuswb m6, m5
15318 movu [r0 + 405 * 16], m6
15319
15320 ; mode 27 [row 6]
15321 pmaddubsw m6, m7, [r5 + 14 * 16]
15322 pmulhrsw m6, m3
15323 pmaddubsw m5, m4, [r5 + 14 * 16]
15324 pmulhrsw m5, m3
15325 packuswb m6, m5
15326 movu [r0 + 406 * 16], m6
15327
15328 ; mode 27 [row 7]
15329 pmaddubsw m6, m7, [r5 + 16 * 16]
15330 pmulhrsw m6, m3
15331 pmaddubsw m5, m4, [r5 + 16 * 16]
15332 pmulhrsw m5, m3
15333 packuswb m6, m5
15334 movu [r0 + 407 * 16], m6
15335
15336 ; mode 27 [row 8]
15337 pmaddubsw m6, m7, [r5 + 18 * 16]
15338 pmulhrsw m6, m3
15339 pmaddubsw m5, m4, [r5 + 18 * 16]
15340 pmulhrsw m5, m3
15341 packuswb m6, m5
15342 movu [r0 + 408 * 16], m6
15343
15344 ; mode 27 [row 9]
15345 pmaddubsw m6, m7, [r5 + 20 * 16]
15346 pmulhrsw m6, m3
15347 pmaddubsw m5, m4, [r5 + 20 * 16]
15348 pmulhrsw m5, m3
15349 packuswb m6, m5
15350 movu [r0 + 409 * 16], m6
15351
15352 ; mode 27 [row 10]
15353 pmaddubsw m6, m7, [r5 + 22 * 16]
15354 pmulhrsw m6, m3
15355 pmaddubsw m5, m4, [r5 + 22 * 16]
15356 pmulhrsw m5, m3
15357 packuswb m6, m5
15358 movu [r0 + 410 * 16], m6
15359
15360 ; mode 27 [row 11]
15361 pmaddubsw m6, m7, [r5 + 24 * 16]
15362 pmulhrsw m6, m3
15363 pmaddubsw m5, m4, [r5 + 24 * 16]
15364 pmulhrsw m5, m3
15365 packuswb m6, m5
15366 movu [r0 + 411 * 16], m6
15367
15368 ; mode 27 [row 12]
15369 pmaddubsw m6, m7, [r5 + 26 * 16]
15370 pmulhrsw m6, m3
15371 pmaddubsw m5, m4, [r5 + 26 * 16]
15372 pmulhrsw m5, m3
15373 packuswb m6, m5
15374 movu [r0 + 412 * 16], m6
15375
15376 ; mode 27 [row 13]
15377 pmaddubsw m6, m7, [r5 + 28 * 16]
15378 pmulhrsw m6, m3
15379 pmaddubsw m5, m4, [r5 + 28 * 16]
15380 pmulhrsw m5, m3
15381 packuswb m6, m5
15382 movu [r0 + 413 * 16], m6
15383
15384 ; mode 27 [row 14]
15385 pmaddubsw m6, m7, [r5 + 30 * 16]
15386 pmulhrsw m6, m3
15387 pmaddubsw m5, m4, [r5 + 30 * 16]
15388 pmulhrsw m5, m3
15389 packuswb m6, m5
15390 movu [r0 + 414 * 16], m6
15391
15392 ; mode 28 [row 0]
15393 movu m1, [r3 + 1]
15394 psrldq m2, m1, 1
15395 punpcklbw m1, m2
15396 movu m4, [r3 + 9]
15397 psrldq m5, m4, 1
15398 punpcklbw m4, m5
15399 pmaddubsw m2, m1, [r5 + 5 * 16]
15400 pmulhrsw m2, m3
15401 pmaddubsw m5, m4, [r5 + 5 * 16]
15402 pmulhrsw m5, m3
15403 packuswb m2, m5
15404 movu [r0 + 416 * 16], m2
15405
15406 ; mode 28 [row 0] (redundant: repeats the identical computation and store directly above)
15407 pmaddubsw m2, m1, [r5 + 5 * 16]
15408 pmulhrsw m2, m3
15409 pmaddubsw m5, m4, [r5 + 5 * 16]
15410 pmulhrsw m5, m3
15411 packuswb m2, m5
15412 movu [r0 + 416 * 16], m2
15413
15414 ; mode 28 [row 1]
15415 pmaddubsw m2, m1, [r5 + 10 * 16]
15416 pmulhrsw m2, m3
15417 pmaddubsw m5, m4, [r5 + 10 * 16]
15418 pmulhrsw m5, m3
15419 packuswb m2, m5
15420 movu [r0 + 417 * 16], m2
15421
15422 ; mode 28 [row 2]
15423 pmaddubsw m2, m1, [r5 + 15 * 16]
15424 pmulhrsw m2, m3
15425 pmaddubsw m5, m4, [r5 + 15 * 16]
15426 pmulhrsw m5, m3
15427 packuswb m2, m5
15428 movu [r0 + 418 * 16], m2
15429
15430 ; mode 28 [row 3]
15431 pmaddubsw m2, m1, [r5 + 20 * 16]
15432 pmulhrsw m2, m3
15433 pmaddubsw m5, m4, [r5 + 20 * 16]
15434 pmulhrsw m5, m3
15435 packuswb m2, m5
15436 movu [r0 + 419 * 16], m2
15437
15438 ; mode 28 [row 4]
15439 pmaddubsw m2, m1, [r5 + 25 * 16]
15440 pmulhrsw m2, m3
15441 pmaddubsw m5, m4, [r5 + 25 * 16]
15442 pmulhrsw m5, m3
15443 packuswb m2, m5
15444 movu [r0 + 420 * 16], m2
15445
15446 ; mode 28 [row 5]
15447 pmaddubsw m2, m1, [r5 + 30 * 16]
15448 pmulhrsw m2, m3
15449 pmaddubsw m5, m4, [r5 + 30 * 16]
15450 pmulhrsw m5, m3
15451 packuswb m2, m5
15452 movu [r0 + 421 * 16], m2
15453
15454 ; mode 29 [row 0]
15455 pmaddubsw m2, m1, [r5 + 9 * 16]
15456 pmulhrsw m2, m3
15457 pmaddubsw m5, m4, [r5 + 9 * 16]
15458 pmulhrsw m5, m3
15459 packuswb m2, m5
15460 movu [r0 + 432 * 16], m2
15461
15462 ; mode 29 [row 1]
15463 pmaddubsw m2, m1, [r5 + 18 * 16]
15464 pmulhrsw m2, m3
15465 pmaddubsw m5, m4, [r5 + 18 * 16]
15466 pmulhrsw m5, m3
15467 packuswb m2, m5
15468 movu [r0 + 433 * 16], m2
15469
15470 ; mode 29 [row 2]
15471 pmaddubsw m2, m1, [r5 + 27 * 16]
15472 pmulhrsw m2, m3
15473 pmaddubsw m5, m4, [r5 + 27 * 16]
15474 pmulhrsw m5, m3
15475 packuswb m2, m5
15476 movu [r0 + 434 * 16], m2
15477
15478 ; mode 30 [row 0]
15479 pmaddubsw m2, m1, [r5 + 13 * 16]
15480 pmulhrsw m2, m3
15481 pmaddubsw m5, m4, [r5 + 13 * 16]
15482 pmulhrsw m5, m3
15483 packuswb m2, m5
15484 movu [r0 + 448 * 16], m2
15485
15486 ; mode 30 [row 1]
15487 pmaddubsw m2, m1, [r5 + 26 * 16]
15488 pmulhrsw m2, m3
15489 pmaddubsw m5, m4, [r5 + 26 * 16]
15490 pmulhrsw m5, m3
15491 packuswb m2, m5
15492 movu [r0 + 449 * 16], m2
15493
15494 ; mode 33 [row 0]
15495 movu [r0 + 496 * 16], m2
15496
15497 ; mode 31 [row 0]
15498 pmaddubsw m2, m1, [r5 + 17 * 16]
15499 pmulhrsw m2, m3
15500 pmaddubsw m5, m4, [r5 + 17 * 16]
15501 pmulhrsw m5, m3
15502 packuswb m2, m5
15503 movu [r0 + 464 * 16], m2
15504
15505 ; mode 32 [row 0]
15506 pmaddubsw m2, m1, [r5 + 21 * 16]
15507 pmulhrsw m2, m3
15508 pmaddubsw m5, m4, [r5 + 21 * 16]
15509 pmulhrsw m5, m3
15510 packuswb m2, m5
15511 movu [r0 + 480 * 16], m2
15512
15513 ; mode 28 [row 6]
15514 movd m7, [r3 + 9]
15515 palignr m7, m1, 2
15516 pmaddubsw m2, m7, [r5 + 3 * 16]
15517 pmulhrsw m2, m3
15518 movd m6, [r3 + 17]
15519 palignr m6, m4, 2
15520 pmaddubsw m5, m6, [r5 + 3 * 16]
15521 pmulhrsw m5, m3
15522 packuswb m2, m5
15523 movu [r0 + 422 * 16], m2
15524
15525 ; mode 28 [row 7]
15526 pmaddubsw m2, m7, [r5 + 8 * 16]
15527 pmulhrsw m2, m3
15528 pmaddubsw m5, m6, [r5 + 8 * 16]
15529 pmulhrsw m5, m3
15530 packuswb m2, m5
15531 movu [r0 + 423 * 16], m2
15532
15533 ; mode 28 [row 8]
15534 pmaddubsw m2, m7, [r5 + 13 * 16]
15535 pmulhrsw m2, m3
15536 pmaddubsw m5, m6, [r5 + 13 * 16]
15537 pmulhrsw m5, m3
15538 packuswb m2, m5
15539 movu [r0 + 424 * 16], m2
15540
15541 ; mode 28 [row 9]
15542 pmaddubsw m2, m7, [r5 + 18 * 16]
15543 pmulhrsw m2, m3
15544 pmaddubsw m5, m6, [r5 + 18 * 16]
15545 pmulhrsw m5, m3
15546 packuswb m2, m5
15547 movu [r0 + 425 * 16], m2
15548
15549 ; mode 28 [row 10]
15550 pmaddubsw m2, m7, [r5 + 23 * 16]
15551 pmulhrsw m2, m3
15552 pmaddubsw m5, m6, [r5 + 23 * 16]
15553 pmulhrsw m5, m3
15554 packuswb m2, m5
15555 movu [r0 + 426 * 16], m2
15556
15557 ; mode 29 [row 3]
15558 pmaddubsw m2, m7, [r5 + 4 * 16]
15559 pmulhrsw m2, m3
15560 pmaddubsw m5, m6, [r5 + 4 * 16]
15561 pmulhrsw m5, m3
15562 packuswb m2, m5
15563 movu [r0 + 435 * 16], m2
15564
15565 ; mode 29 [row 4]
15566 pmaddubsw m2, m7, [r5 + 13 * 16]
15567 pmulhrsw m2, m3
15568 pmaddubsw m5, m6, [r5 + 13 * 16]
15569 pmulhrsw m5, m3
15570 packuswb m2, m5
15571 movu [r0 + 436 * 16], m2
15572
15573 ; mode 29 [row 5]
15574 pmaddubsw m2, m7, [r5 + 22 * 16]
15575 pmulhrsw m2, m3
15576 pmaddubsw m5, m6, [r5 + 22 * 16]
15577 pmulhrsw m5, m3
15578 packuswb m2, m5
15579 movu [r0 + 437 * 16], m2
15580
15581 ; mode 29 [row 6]
15582 pmaddubsw m2, m7, [r5 + 31 * 16]
15583 pmulhrsw m2, m3
15584 pmaddubsw m5, m6, [r5 + 31 * 16]
15585 pmulhrsw m5, m3
15586 packuswb m2, m5
15587 movu [r0 + 438 * 16], m2
15588
15589 ; mode 32 [row 2]
15590 movu [r0 + 482 * 16], m2
15591
15592 ; mode 30 [row 2]
15593 pmaddubsw m2, m7, [r5 + 7 * 16]
15594 pmulhrsw m2, m3
15595 pmaddubsw m5, m6, [r5 + 7 * 16]
15596 pmulhrsw m5, m3
15597 packuswb m2, m5
15598 movu [r0 + 450 * 16], m2
15599
15600 ; mode 30 [row 3]
15601 pmaddubsw m2, m7, [r5 + 20 * 16]
15602 pmulhrsw m2, m3
15603 pmaddubsw m5, m6, [r5 + 20 * 16]
15604 pmulhrsw m5, m3
15605 packuswb m2, m5
15606 movu [r0 + 451 * 16], m2
15607
15608 ; mode 33 [row 1]
15609 movu [r0 + 497 * 16], m2
15610
15611 ; mode 31 [row 1]
15612 pmaddubsw m2, m7, [r5 + 2 * 16]
15613 pmulhrsw m2, m3
15614 pmaddubsw m5, m6, [r5 + 2 * 16]
15615 pmulhrsw m5, m3
15616 packuswb m2, m5
15617 movu [r0 + 465 * 16], m2
15618
15619 ; mode 31 [row 2]
15620 pmaddubsw m2, m7, [r5 + 19 * 16]
15621 pmulhrsw m2, m3
15622 pmaddubsw m5, m6, [r5 + 19 * 16]
15623 pmulhrsw m5, m3
15624 packuswb m2, m5
15625 movu [r0 + 466 * 16], m2
15626
15627 ; mode 32 [row 1]
15628 pmaddubsw m2, m7, [r5 + 10 * 16]
15629 pmulhrsw m2, m3
15630 pmaddubsw m5, m6, [r5 + 10 * 16]
15631 pmulhrsw m5, m3
15632 packuswb m2, m5
15633 movu [r0 + 481 * 16], m2
15634
15635 ; mode 28 [row 11]
15636 pmaddubsw m2, m7, [r5 + 28 * 16]
15637 pmulhrsw m2, m3
15638 pmaddubsw m5, m6, [r5 + 28 * 16]
15639 pmulhrsw m5, m3
15640 packuswb m2, m5
15641 movu [r0 + 427 * 16], m2
15642
15643 ; mode 28 [row 12]
15644 movd m1, [r3 + 10]
15645 palignr m1, m7, 2
15646 pmaddubsw m2, m1, [r5 + 1 * 16]
15647 pmulhrsw m2, m3
15648 movd m4, [r3 + 18]
15649 palignr m4, m6, 2
15650 pmaddubsw m5, m4, [r5 + 1 * 16]
15651 pmulhrsw m5, m3
15652 packuswb m2, m5
15653 movu [r0 + 428 * 16], m2
15654
15655 ; mode 30 [row 4]
15656 movu [r0 + 452 * 16], m2
15657
15658 ; mode 28 [row 13]
15659 pmaddubsw m2, m1, [r5 + 6 * 16]
15660 pmulhrsw m2, m3
15661 pmaddubsw m5, m4, [r5 + 6 * 16]
15662 pmulhrsw m5, m3
15663 packuswb m2, m5
15664 movu [r0 + 429 * 16], m2
15665
15666 ; mode 28 [row 14]
15667 pmaddubsw m2, m1, [r5 + 11 * 16]
15668 pmulhrsw m2, m3
15669 pmaddubsw m5, m4, [r5 + 11 * 16]
15670 pmulhrsw m5, m3
15671 packuswb m2, m5
15672 movu [r0 + 430 * 16], m2
15673
15674 ; mode 28 [row 15]
15675 pmaddubsw m2, m1, [r5 + 16 * 16]
15676 pmulhrsw m2, m3
15677 pmaddubsw m5, m4, [r5 + 16 * 16]
15678 pmulhrsw m5, m3
15679 packuswb m2, m5
15680 movu [r0 + 431 * 16], m2
15681
15682 ; mode 29 [row 7]
15683 pmaddubsw m2, m1, [r5 + 8 * 16]
15684 pmulhrsw m2, m3
15685 pmaddubsw m5, m4, [r5 + 8 * 16]
15686 pmulhrsw m5, m3
15687 packuswb m2, m5
15688 movu [r0 + 439 * 16], m2
15689
15690 ; mode 29 [row 8]
15691 pmaddubsw m2, m1, [r5 + 17 * 16]
15692 pmulhrsw m2, m3
15693 pmaddubsw m5, m4, [r5 + 17 * 16]
15694 pmulhrsw m5, m3
15695 packuswb m2, m5
15696 movu [r0 + 440 * 16], m2
15697
15698 ; mode 29 [row 9]
15699 pmaddubsw m2, m1, [r5 + 26 * 16]
15700 pmulhrsw m2, m3
15701 pmaddubsw m5, m4, [r5 + 26 * 16]
15702 pmulhrsw m5, m3
15703 packuswb m2, m5
15704 movu [r0 + 441 * 16], m2
15705
15706 ; mode 30 [row 5]
15707 pmaddubsw m2, m1, [r5 + 14 * 16]
15708 pmulhrsw m2, m3
15709 pmaddubsw m5, m4, [r5 + 14 * 16]
15710 pmulhrsw m5, m3
15711 packuswb m2, m5
15712 movu [r0 + 453 * 16], m2
15713
15714 ; mode 33 [row 2]
15715 movu [r0 + 498 * 16], m2
15716
15717 ; mode 30 [row 6]
15718 pmaddubsw m2, m1, [r5 + 27 * 16]
15719 pmulhrsw m2, m3
15720 pmaddubsw m5, m4, [r5 + 27 * 16]
15721 pmulhrsw m5, m3
15722 packuswb m2, m5
15723 movu [r0 + 454 * 16], m2
15724
15725 ; mode 31 [row 3]
15726 pmaddubsw m2, m1, [r5 + 4 * 16]
15727 pmulhrsw m2, m3
15728 pmaddubsw m5, m4, [r5 + 4 * 16]
15729 pmulhrsw m5, m3
15730 packuswb m2, m5
15731 movu [r0 + 467 * 16], m2
15732
15733 ; mode 31 [row 4]
15734 pmaddubsw m2, m1, [r5 + 21 * 16]
15735 pmulhrsw m2, m3
15736 pmaddubsw m5, m4, [r5 + 21 * 16]
15737 pmulhrsw m5, m3
15738 packuswb m2, m5
15739 movu [r0 + 468 * 16], m2
15740
15741 ; mode 32 [row 3]
15742 pmaddubsw m2, m1, [r5 + 20 * 16]
15743 pmulhrsw m2, m3
15744 pmaddubsw m5, m4, [r5 + 20 * 16]
15745 pmulhrsw m5, m3
15746 packuswb m2, m5
15747 movu [r0 + 483 * 16], m2
15748
15749 ; mode 29 [row 10]
15750 movd m7, [r3 + 11]
15751 palignr m7, m1, 2
15752 pmaddubsw m2, m7, [r5 + 3 * 16]
15753 pmulhrsw m2, m3
15754 movd m6, [r3 + 19]
15755 palignr m6, m4, 2
15756 pmaddubsw m5, m6, [r5 + 3 * 16]
15757 pmulhrsw m5, m3
15758 packuswb m2, m5
15759 movu [r0 + 442 * 16], m2
15760
15761 ; mode 29 [row 11]
15762 pmaddubsw m2, m7, [r5 + 12 * 16]
15763 pmulhrsw m2, m3
15764 pmaddubsw m5, m6, [r5 + 12 * 16]
15765 pmulhrsw m5, m3
15766 packuswb m2, m5
15767 movu [r0 + 443 * 16], m2
15768
15769 ; mode 29 [row 12]
15770 pmaddubsw m2, m7, [r5 + 21 * 16]
15771 pmulhrsw m2, m3
15772 pmaddubsw m5, m6, [r5 + 21 * 16]
15773 pmulhrsw m5, m3
15774 packuswb m2, m5
15775 movu [r0 + 444 * 16], m2
15776
15777 ; mode 30 [row 8]
15778 movu [r0 + 456 * 16], m2
15779
15780 ; mode 29 [row 13]
15781 pmaddubsw m2, m7, [r5 + 30 * 16]
15782 pmulhrsw m2, m3
15783 pmaddubsw m5, m6, [r5 + 30 * 16]
15784 pmulhrsw m5, m3
15785 packuswb m2, m5
15786 movu [r0 + 445 * 16], m2
15787
15788 ; mode 32 [row 5]
15789 movu [r0 + 485 * 16], m2
15790
15791 ; mode 30 [row 7]
15792 pmaddubsw m2, m7, [r5 + 8 * 16]
15793 pmulhrsw m2, m3
15794 pmaddubsw m5, m6, [r5 + 8 * 16]
15795 pmulhrsw m5, m3
15796 packuswb m2, m5
15797 movu [r0 + 455 * 16], m2
15798
15799 ; mode 33 [row 3]
15800 movu [r0 + 499 * 16], m2
15801
15802 ; mode 31 [row 5]
15803 pmaddubsw m2, m7, [r5 + 6 * 16]
15804 pmulhrsw m2, m3
15805 pmaddubsw m5, m6, [r5 + 6 * 16]
15806 pmulhrsw m5, m3
15807 packuswb m2, m5
15808 movu [r0 + 469 * 16], m2
15809
15810 ; mode 31 [row 6]
15811 pmaddubsw m2, m7, [r5 + 23 * 16]
15812 pmulhrsw m2, m3
15813 pmaddubsw m5, m6, [r5 + 23 * 16]
15814 pmulhrsw m5, m3
15815 packuswb m2, m5
15816 movu [r0 + 470 * 16], m2
15817
15818 ; mode 32 [row 4]
15819 pmaddubsw m2, m7, [r5 + 9 * 16]
15820 pmulhrsw m2, m3
15821 pmaddubsw m5, m6, [r5 + 9 * 16]
15822 pmulhrsw m5, m3
15823 packuswb m2, m5
15824 movu [r0 + 484 * 16], m2
15825
15826 movu m1, m7
15827 movu m4, m6
15828
15829 ; mode 29 [row 14]
15830 movu m1, [r3 + 12]
15831 palignr m1, m7, 2
15832 pmaddubsw m2, m1, [r5 + 7 * 16]
15833 pmulhrsw m2, m3
15834 movd m4, [r3 + 20]
15835 palignr m4, m6, 2
15836 pmaddubsw m5, m4, [r5 + 7 * 16]
15837 pmulhrsw m5, m3
15838 packuswb m2, m5
15839 movu [r0 + 446 * 16], m2
15840
15841 ; mode 29 [row 15]
15842 pmaddubsw m2, m1, [r5 + 16 * 16]
15843 pmulhrsw m2, m3
15844 pmaddubsw m5, m4, [r5 + 16 * 16]
15845 pmulhrsw m5, m3
15846 packuswb m2, m5
15847 movu [r0 + 447 * 16], m2
15848
15849 ; mode 30 [row 9]
15850 pmaddubsw m2, m1, [r5 + 2 * 16]
15851 pmulhrsw m2, m3
15852 pmaddubsw m5, m4, [r5 + 2 * 16]
15853 pmulhrsw m5, m3
15854 packuswb m2, m5
15855 movu [r0 + 457 * 16], m2
15856
15857 ; mode 33 [row 4]
15858 movu [r0 + 500 * 16], m2
15859
15860 ; mode 30 [row 10]
15861 pmaddubsw m2, m1, [r5 + 15 * 16]
15862 pmulhrsw m2, m3
15863 pmaddubsw m5, m4, [r5 + 15 * 16]
15864 pmulhrsw m5, m3
15865 packuswb m2, m5
15866 movu [r0 + 458 * 16], m2
15867
15868 ; mode 30 [row 11]
15869 pmaddubsw m2, m1, [r5 + 28 * 16]
15870 pmulhrsw m2, m3
15871 pmaddubsw m5, m4, [r5 + 28 * 16]
15872 pmulhrsw m5, m3
15873 packuswb m2, m5
15874 movu [r0 + 459 * 16], m2
15875
15876 ; mode 33 [row 5]
15877 movu [r0 + 501 * 16], m2
15878
15879 ; mode 31 [row 7]
15880 pmaddubsw m2, m1, [r5 + 8 * 16]
15881 pmulhrsw m2, m3
15882 pmaddubsw m5, m4, [r5 + 8 * 16]
15883 pmulhrsw m5, m3
15884 packuswb m2, m5
15885 movu [r0 + 471 * 16], m2
15886
15887 ; mode 31 [row 8]
15888 pmaddubsw m2, m1, [r5 + 25 * 16]
15889 pmulhrsw m2, m3
15890 pmaddubsw m5, m4, [r5 + 25 * 16]
15891 pmulhrsw m5, m3
15892 packuswb m2, m5
15893 movu [r0 + 472 * 16], m2
15894
15895 ; mode 32 [row 6]
15896 pmaddubsw m2, m1, [r5 + 19 * 16]
15897 pmulhrsw m2, m3
15898 pmaddubsw m5, m4, [r5 + 19 * 16]
15899 pmulhrsw m5, m3
15900 packuswb m2, m5
15901 movu [r0 + 486 * 16], m2
15902
15903 ; mode 30 [row 12]
15904 movd m7, [r3 + 13]
15905 palignr m7, m1, 2
15906 pmaddubsw m2, m7, [r5 + 9 * 16]
15907 pmulhrsw m2, m3
15908 movd m6, [r3 + 21]
15909 palignr m6, m4, 2
15910 pmaddubsw m5, m6, [r5 + 9 * 16]
15911 pmulhrsw m5, m3
15912 packuswb m2, m5
15913 movu [r0 + 460 * 16], m2
15914
15915 ; mode 30 [row 13]
15916 pmaddubsw m2, m7, [r5 + 22 * 16]
15917 pmulhrsw m2, m3
15918 pmaddubsw m5, m6, [r5 + 22 * 16]
15919 pmulhrsw m5, m3
15920 packuswb m2, m5
15921 movu [r0 + 461 * 16], m2
15922
15923 ; mode 33 [row 6]
15924 movu [r0 + 502 * 16], m2
15925
15926 ; mode 31 [row 9]
15927 pmaddubsw m2, m7, [r5 + 10 * 16]
15928 pmulhrsw m2, m3
15929 pmaddubsw m5, m6, [r5 + 10 * 16]
15930 pmulhrsw m5, m3
15931 packuswb m2, m5
15932 movu [r0 + 473 * 16], m2
15933
15934 ; mode 31 [row 10]
15935 pmaddubsw m2, m7, [r5 + 27 * 16]
15936 pmulhrsw m2, m3
15937 pmaddubsw m5, m6, [r5 + 27 * 16]
15938 pmulhrsw m5, m3
15939 packuswb m2, m5
15940 movu [r0 + 474 * 16], m2
15941
15942 ; mode 32 [row 7]
15943 pmaddubsw m2, m7, [r5 + 8 * 16]
15944 pmulhrsw m2, m3
15945 pmaddubsw m5, m6, [r5 + 8 * 16]
15946 pmulhrsw m5, m3
15947 packuswb m2, m5
15948 movu [r0 + 487 * 16], m2
15949
15950 ; mode 32 [row 8]
15951 pmaddubsw m2, m7, [r5 + 29 * 16]
15952 pmulhrsw m2, m3
15953 pmaddubsw m5, m6, [r5 + 29 * 16]
15954 pmulhrsw m5, m3
15955 packuswb m2, m5
15956 movu [r0 + 488 * 16], m2
15957
15958
15959 movu m1, m7
15960 movu m4, m6
15961
15962 ; mode 30 [row 14]
15963 movd m1, [r3 + 14]
15964 palignr m1, m7, 2
15965 pmaddubsw m2, m1, [r5 + 3 * 16]
15966 pmulhrsw m2, m3
15967 movd m4, [r3 + 22]
15968 palignr m4, m6, 2
15969 pmaddubsw m5, m4, [r5 + 3 * 16]
15970 pmulhrsw m5, m3
15971 packuswb m2, m5
15972 movu [r0 + 462 * 16], m2
15973
15974 ; mode 30 [row 15]
15975 pmaddubsw m2, m1, [r5 + 16 * 16]
15976 pmulhrsw m2, m3
15977 pmaddubsw m5, m4, [r5 + 16 * 16]
15978 pmulhrsw m5, m3
15979 packuswb m2, m5
15980 movu [r0 + 463 * 16], m2
15981
15982 ; mode 33 [row 7]
15983 movu [r0 + 503 * 16], m2
15984
15985 ; mode 31 [row 11]
15986 pmaddubsw m2, m1, [r5 + 12 * 16]
15987 pmulhrsw m2, m3
15988 pmaddubsw m5, m4, [r5 + 12 * 16]
15989 pmulhrsw m5, m3
15990 packuswb m2, m5
15991 movu [r0 + 475 * 16], m2
15992
15993 ; mode 31 [row 12]
15994 pmaddubsw m2, m1, [r5 + 29 * 16]
15995 pmulhrsw m2, m3
15996 pmaddubsw m5, m4, [r5 + 29 * 16]
15997 pmulhrsw m5, m3
15998 packuswb m2, m5
15999 movu [r0 + 476 * 16], m2
16000
16001 ; mode 32 [row 9]
16002 pmaddubsw m2, m1, [r5 + 18 * 16]
16003 pmulhrsw m2, m3
16004 pmaddubsw m5, m4, [r5 + 18 * 16]
16005 pmulhrsw m5, m3
16006 packuswb m2, m5
16007 movu [r0 + 489 * 16], m2
16008
16009 ; mode 31 [row 13]
16010 movd m7, [r3 + 15]
16011 palignr m7, m1, 2
16012 pmaddubsw m2, m7, [r5 + 14 * 16]
16013 pmulhrsw m2, m3
16014 movd m6, [r3 + 23]
16015 palignr m6, m4, 2
16016 pmaddubsw m5, m6, [r5 + 14 * 16]
16017 pmulhrsw m5, m3
16018 packuswb m2, m5
16019 movu [r0 + 477 * 16], m2
16020
16021 ; mode 31 [row 14]
16022 pmaddubsw m2, m7, [r5 + 31 * 16]
16023 pmulhrsw m2, m3
16024 pmaddubsw m5, m6, [r5 + 31 * 16]
16025 pmulhrsw m5, m3
16026 packuswb m2, m5
16027 movu [r0 + 478 * 16], m2
16028
16029 ; mode 32 [row 10]
16030 pmaddubsw m2, m7, [r5 + 7 * 16]
16031 pmulhrsw m2, m3
16032 pmaddubsw m5, m6, [r5 + 7 * 16]
16033 pmulhrsw m5, m3
16034 packuswb m2, m5
16035 movu [r0 + 490 * 16], m2
16036
16037 ; mode 32 [row 11]
16038 pmaddubsw m2, m7, [r5 + 28 * 16]
16039 pmulhrsw m2, m3
16040 pmaddubsw m5, m6, [r5 + 28 * 16]
16041 pmulhrsw m5, m3
16042 packuswb m2, m5
16043 movu [r0 + 491 * 16], m2
16044
16045 ; mode 33 [row 8]
16046 pmaddubsw m2, m7, [r5 + 10 * 16]
16047 pmulhrsw m2, m3
16048 pmaddubsw m5, m6, [r5 + 10 * 16]
16049 pmulhrsw m5, m3
16050 packuswb m2, m5
16051 movu [r0 + 504 * 16], m2
16052
16053 ; mode 31 [row 15]
16054 movd m1, [r3 + 16]
16055 palignr m1, m7, 2
16056 pmaddubsw m2, m1, [r5 + 16 * 16]
16057 pmulhrsw m2, m3
16058 movd m4, [r3 + 24]
16059 palignr m4, m6, 2
16060 pmaddubsw m5, m4, [r5 + 16 * 16]
16061 pmulhrsw m5, m3
16062 packuswb m2, m5
16063 movu [r0 + 479 * 16], m2
16064
16065 ; mode 32 [row 12]
16066 pmaddubsw m2, m1, [r5 + 17 * 16]
16067 pmulhrsw m2, m3
16068 pmaddubsw m5, m4, [r5 + 17 * 16]
16069 pmulhrsw m5, m3
16070 packuswb m2, m5
16071 movu [r0 + 492 * 16], m2
16072
16073 ; mode 33 [row 9]
16074 pmaddubsw m2, m1, [r5 + 4 * 16]
16075 pmulhrsw m2, m3
16076 pmaddubsw m5, m4, [r5 + 4 * 16]
16077 pmulhrsw m5, m3
16078 packuswb m2, m5
16079 movu [r0 + 505 * 16], m2
16080
16081 ; mode 33 [row 10]
16082 pmaddubsw m2, m1, [r5 + 30 * 16]
16083 pmulhrsw m2, m3
16084 pmaddubsw m5, m4, [r5 + 30 * 16]
16085 pmulhrsw m5, m3
16086 packuswb m2, m5
16087 movu [r0 + 506 * 16], m2
16088
16089 ; mode 33 [row 9] (redundant: same coefficient 4 * 16 and same store to 505 * 16 as the row 9 block above)
16090 pmaddubsw m2, m1, [r5 + 4 * 16]
16091 pmulhrsw m2, m3
16092 pmaddubsw m5, m4, [r5 + 4 * 16]
16093 pmulhrsw m5, m3
16094 packuswb m2, m5
16095 movu [r0 + 505 * 16], m2
16096
16097 ; mode 32 [row 13]
16098 movd m7, [r3 + 17]
16099 palignr m7, m1, 2
16100 pmaddubsw m2, m7, [r5 + 6 * 16]
16101 pmulhrsw m2, m3
16102
16103 movd m6, [r3 + 25]
16104 palignr m6, m4, 2
16105 pmaddubsw m5, m6, [r5 + 6 * 16]
16106 pmulhrsw m5, m3
16107 packuswb m2, m5
16108 movu [r0 + 493 * 16], m2
16109
16110 ; mode 32 [row 14]
16111 pmaddubsw m2, m7, [r5 + 27 * 16]
16112 pmulhrsw m2, m3
16113 pmaddubsw m5, m6, [r5 + 27 * 16]
16114 pmulhrsw m5, m3
16115 packuswb m2, m5
16116 movu [r0 + 494 * 16], m2
16117
16118 ; mode 33 [row 11]
16119 pmaddubsw m2, m7, [r5 + 24 * 16]
16120 pmulhrsw m2, m3
16121 pmaddubsw m5, m6, [r5 + 24 * 16]
16122 pmulhrsw m5, m3
16123 packuswb m2, m5
16124 movu [r0 + 507 * 16], m2
16125
16126 ; mode 32 [row 15]
16127 movd m1, [r3 + 18]
16128 palignr m1, m7, 2
16129 pmaddubsw m2, m1, [r5 + 16 * 16]
16130 pmulhrsw m2, m3
16131 psrldq m4, 2
16132 pinsrb m4, [r3 + 26], 14
16133 pinsrb m4, [r3 + 27], 15
16134 movd m4, [r3 + 26]
16135 palignr m4, m6, 2
16136 pmaddubsw m5, m4, [r5 + 16 * 16]
16137 pmulhrsw m5, m3
16138 packuswb m2, m5
16139 movu [r0 + 495 * 16], m2
16140
16141 ; mode 33 [row 12]
16142 pmaddubsw m2, m1, [r5 + 18 * 16]
16143 pmulhrsw m2, m3
16144 pmaddubsw m5, m4, [r5 + 18 * 16]
16145 pmulhrsw m5, m3
16146 packuswb m2, m5
16147 movu [r0 + 508 * 16], m2
16148
16149 ; mode 33 [row 13]
16150 movd m7, [r3 + 19]
16151 palignr m7, m1, 2
16152 pmaddubsw m2, m7, [r5 + 12 * 16]
16153 pmulhrsw m2, m3
16154 movd m6, [r3 + 27]
16155 palignr m6, m4, 2
16156 pmaddubsw m5, m6, [r5 + 12 * 16]
16157 pmulhrsw m5, m3
16158 packuswb m2, m5
16159 movu [r0 + 509 * 16], m2
16160
16161 ; mode 33 [row 14]
16162 movd m1, [r3 + 20]
16163 palignr m1, m7, 2
16164 pmaddubsw m2, m1, [r5 + 6 * 16]
16165 pmulhrsw m2, m3
16166 movd m4, [r3 + 28]
16167 palignr m4, m6, 2
16168 pmaddubsw m5, m4, [r5 + 6 * 16]
16169 pmulhrsw m5, m3
16170 packuswb m2, m5
16171 movu [r0 + 510 * 16], m2
16172
16173 ; mode 34 [row 0]
16174 movu m1, [r3 + 2]
16175 movu [r0 + 512 * 16], m1
16176 movu m2, [r3 + 18]
16177 palignr m3, m2, m1, 1
16178 movu [r0 + 513 * 16], m3
16179 palignr m3, m2, m1, 2
16180 movu [r0 + 514 * 16], m3
16181 palignr m3, m2, m1, 3
16182 movu [r0 + 515 * 16], m3
16183 palignr m3, m2, m1, 4
16184 movu [r0 + 516 * 16], m3
16185 palignr m3, m2, m1, 5
16186 movu [r0 + 517 * 16], m3
16187 palignr m3, m2, m1, 6
16188 movu [r0 + 518 * 16], m3
16189 palignr m3, m2, m1, 7
16190 movu [r0 + 519 * 16], m3
16191 palignr m3, m2, m1, 8
16192 movu [r0 + 520 * 16], m3
16193 palignr m3, m2, m1, 9
16194 movu [r0 + 521 * 16], m3
16195 palignr m3, m2, m1, 10
16196 movu [r0 + 522 * 16], m3
16197 palignr m3, m2, m1, 11
16198 movu [r0 + 523 * 16], m3
16199 palignr m3, m2, m1, 12
16200 movu [r0 + 524 * 16], m3
16201
16202 ; mode 33 [row 15]
16203 movu [r0 + 511 * 16], m3
16204
16205 ; mode 34
16206 palignr m3, m2, m1, 13
16207 movu [r0 + 525 * 16], m3
16208 palignr m3, m2, m1, 14
16209 movu [r0 + 526 * 16], m3
16210 palignr m3, m2, m1, 15
16211 movu [r0 + 527 * 16], m3
16212
16213 RET
16214
16215 ;-----------------------------------------------------------------------------
16216 ; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
16217 ;-----------------------------------------------------------------------------
16218 INIT_XMM sse4
16219 cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
16220
16221 ;mode 2[row 0]
16222 movu m0, [r4 + 2]
16223 movu [r0 + 0 * 16], m0
16224 movu m1, [r4 + 18]
16225 movu [r0 + 1 * 16], m1
16226
16227 ;mode 9 [row 15]
16228 movu [r0 + 478 * 16], m0
16229 movu [r0 + 479 * 16], m1
16230
16231 ;mode 2[row 1]
16232 movu m2, [r4 + 34]
16233 palignr m3, m1, m0, 1
16234 movu [r0 + 2 * 16], m3
16235 palignr m4, m2, m1, 1
16236 movu [r0 + 3 * 16], m4
16237
16238 ; mode 9 [row 31]
16239 movu [r0 + 510 * 16], m3
16240 movu [r0 + 511 * 16], m4
16241
16242 ;mode 2[row 17]
16243 movu [r0 + 34 * 16], m4
16244 movu m5, [r4 + 35]
16245 movu [r0 + 35 * 16], m5
16246
16247 ;mode 2[row 2]
16248 palignr m3, m1, m0, 2
16249 movu [r0 + 4 * 16], m3
16250 palignr m4, m2, m1, 2
16251 movu [r0 + 5 * 16], m4
16252
16253 ;mode 2[row 18]
16254 movu [r0 + 36 * 16], m4
16255 movu m6, [r4 + 51]
16256 palignr m7, m6, m5, 1
16257 movu [r0 + 37 * 16], m7
16258
16259 ;mode 2[row 3]
16260 palignr m3, m1, m0, 3
16261 movu [r0 + 6 * 16], m3
16262 palignr m4, m2, m1, 3
16263 movu [r0 + 7 * 16], m4
16264
16265 ;mode 2[row 19]
16266 movu [r0 + 38 * 16], m4
16267 palignr m7, m6, m5, 2
16268 movu [r0 + 39 * 16], m7
16269
16270 ;mode 2[row 4]
16271 palignr m3, m1, m0, 4
16272 movu [r0 + 8 * 16], m3
16273 palignr m4, m2, m1, 4
16274 movu [r0 + 9 * 16], m4
16275
16276 ; mode 8 [row 31]
16277 movu [r0 + 446 * 16], m3
16278 movu [r0 + 447 * 16], m4
16279
16280 ;mode 2[row 20]
16281 movu [r0 + 40 * 16], m4
16282 palignr m7, m6, m5, 3
16283 movu [r0 + 41 * 16], m7
16284
16285 ; mode 4 [row 31]
16286 movu [r0 + 190 * 16], m4
16287 movu [r0 + 191 * 16], m7
16288
16289 ;mode 2[row 5]
16290 palignr m3, m1, m0, 5
16291 movu [r0 + 10 * 16], m3
16292 palignr m4, m2, m1, 5
16293 movu [r0 + 11 * 16], m4
16294
16295 ;mode 2[row 21]
16296 movu [r0 + 42 * 16], m4
16297 palignr m7, m6, m5, 4
16298 movu [r0 + 43 * 16], m7
16299
16300 ;mode 2[row 6]
16301 palignr m3, m1, m0, 6
16302 movu [r0 + 12 * 16], m3
16303 palignr m4, m2, m1, 6
16304 movu [r0 + 13 * 16], m4
16305
16306 ;mode 2[row 22]
16307 movu [r0 + 44 * 16], m4
16308 palignr m7, m6, m5, 5
16309 movu [r0 + 45 * 16], m7
16310
16311 ;mode 2[row 7]
16312 palignr m3, m1, m0, 7
16313 movu [r0 + 14 * 16], m3
16314 palignr m4, m2, m1, 7
16315 movu [r0 + 15 * 16], m4
16316
16317 ;mode 2[row 23]
16318 movu [r0 + 46 * 16], m4
16319 palignr m7, m6, m5, 6
16320 movu [r0 + 47 * 16], m7
16321
16322 ;mode 2[row 8]
16323 palignr m3, m1, m0, 8
16324 movu [r0 + 16 * 16], m3
16325 palignr m4, m2, m1, 8
16326 movu [r0 + 17 * 16], m4
16327
16328 ;mode 7[row 31]
16329 movu [r0 + 382 * 16], m3
16330 movu [r0 + 383 * 16], m4
16331
16332 ;mode 2[row 24]
16333 movu [r0 + 48 * 16], m4
16334 palignr m7, m6, m5, 7
16335 movu [r0 + 49 * 16], m7
16336
16337 ;mode 2[row 9]
16338 palignr m3, m1, m0, 9
16339 movu [r0 + 18 * 16], m3
16340 palignr m4, m2, m1, 9
16341 movu [r0 + 19 * 16], m4
16342
16343 ;mode 2[row 25]
16344 movu [r0 + 50 * 16], m4
16345 palignr m7, m6, m5, 8
16346 movu [r0 + 51 * 16], m7
16347
16348 ; mode 3 [row 31]
16349 movu [r0 + 126 * 16], m4
16350 movu [r0 + 127 * 16], m7
16351
16352 ;mode 2[row 10]
16353 palignr m3, m1, m0, 10
16354 movu [r0 + 20 * 16], m3
16355 palignr m4, m2, m1, 10
16356 movu [r0 + 21 * 16], m4
16357
16358 ;mode 2[row 26]
16359 movu [r0 + 52 * 16], m4
16360 palignr m7, m6, m5, 9
16361 movu [r0 + 53 * 16], m7
16362
16363 ;mode 2[row 11]
16364 palignr m3, m1, m0, 11
16365 movu [r0 + 22 * 16], m3
16366 palignr m4, m2, m1, 11
16367 movu [r0 + 23 * 16], m4
16368
16369 ;mode 2[row 27]
16370 movu [r0 + 54 * 16], m4
16371 palignr m7, m6, m5, 10
16372 movu [r0 + 55 * 16], m7
16373
16374 ;mode 2[row 12]
16375 palignr m3, m1, m0, 12
16376 movu [r0 + 24 * 16], m3
16377 palignr m4, m2, m1, 12
16378 movu [r0 + 25 * 16], m4
16379
16380 ; mode 6 [row 31]
16381 movu [r0 + 318 * 16], m3
16382 movu [r0 + 319 * 16], m4
16383
16384 ; mode 3 [row 15]
16385 movu [r0 + 94 * 16], m3
16386 movu [r0 + 95 * 16], m4
16387
16388 ;mode 2[row 28]
16389 movu [r0 + 56 * 16], m4
16390 palignr m7, m6, m5, 11
16391 movu [r0 + 57 * 16], m7
16392
16393 ;mode 2[row 13]
16394 palignr m3, m1, m0, 13
16395 movu [r0 + 26 * 16], m3
16396 palignr m4, m2, m1, 13
16397 movu [r0 + 27 * 16], m4
16398
16399 ;mode 2[row 29]
16400 movu [r0 + 58 * 16], m4
16401 palignr m7, m6, m5, 12
16402 movu [r0 + 59 * 16], m7
16403
16404 ;mode 2[row 14]
16405 palignr m3, m1, m0, 14
16406 movu [r0 + 28 * 16], m3
16407 palignr m4, m2, m1, 14
16408 movu [r0 + 29 * 16], m4
16409
16410 ;mode 2[row 30]
16411 movu [r0 + 60 * 16], m4
16412 palignr m7, m6, m5, 13
16413 movu [r0 + 61 * 16], m7
16414
16415 ;mode 2[row 15]
16416 palignr m3, m1, m0, 15
16417 movu [r0 + 30 * 16], m3
16418 palignr m4, m2, m1, 15
16419 movu [r0 + 31 * 16], m4
16420
16421 ;mode 2[row 31]
16422 movu [r0 + 62 * 16], m4
16423 palignr m7, m6, m5, 14
16424 movu [r0 + 63 * 16], m7
16425
16426 ;mode 2[row 16]
16427 movu [r0 + 32 * 16], m1
16428 movu [r0 + 33 * 16], m2
16429
16430 ; mode 5[row 31]
16431 movu [r0 + 254 * 16], m1
16432 movu [r0 + 255 * 16], m2
16433
16434 ; mode 3 [row 0]
16435 lea r5, [ang_table]
16436 movu m6, [r5 + 26 * 16]
16437 movu m7, [pw_1024 ]
16438 movu m1, [r4 + 1 ]
16439 punpcklbw m1, m0
16440 pmaddubsw m0, m1, m6
16441 pmulhrsw m0, m7
16442 movu m2, [r4 + 9]
16443 movd m3, [r4 + 10]
16444 palignr m3, m2, 1
16445 punpcklbw m2, m3
16446 pmaddubsw m3, m2, m6
16447 pmulhrsw m3, m7
16448 packuswb m0, m3
16449 movu [r0 + 64 * 16], m0
16450
16451 ; mode 6 [row 1 - first half]
16452 movu [r0 + 258 * 16], m0
16453
16454 ; mode 9 [row 12 - first half]
16455 movu [r0 + 472 * 16], m0
16456
16457 movu m0, [r4 + 17]
16458 movd m3, [r4 + 18]
16459 palignr m3, m0, 1
16460 punpcklbw m0, m3
16461 pmaddubsw m3, m0, m6
16462 pmulhrsw m3, m7
16463 movu m4, [r4 + 25]
16464 movd m5, [r4 + 26]
16465 palignr m5, m4, 1
16466 punpcklbw m4, m5
16467 pmaddubsw m5, m4, m6
16468 pmulhrsw m5, m7
16469 packuswb m3, m5
16470 movu [r0 + 65 * 16], m3
16471
16472 ; mode 6 [row 1 - second half]
16473 movu [r0 + 259 * 16], m3
16474
16475 ; mode 9 [row 12 - second half]
16476 movu [r0 + 473 * 16], m3
16477
16478 ; mode 4 [row 0]
16479 movu m6, [r5 + 21 * 16]
16480 pmaddubsw m3, m1, m6
16481 pmulhrsw m3, m7
16482 pmaddubsw m5, m2, m6
16483 pmulhrsw m5, m7
16484 packuswb m3, m5
16485 movu [r0 + 128 * 16], m3
16486 pmaddubsw m3, m0, m6
16487 pmulhrsw m3, m7
16488 pmaddubsw m5, m4, m6
16489 pmulhrsw m5, m7
16490 packuswb m3, m5
16491 movu [r0 + 129 * 16], m3
16492
16493 ; mode 5 [row 0]
16494 movu m6, [r5 + 17 * 16]
16495 pmaddubsw m3, m1, m6
16496 pmulhrsw m3, m7
16497 pmaddubsw m5, m2, m6
16498 pmulhrsw m5, m7
16499 packuswb m3, m5
16500 movu [r0 + 192 * 16], m3
16501 pmaddubsw m3, m0, m6
16502 pmulhrsw m3, m7
16503 pmaddubsw m5, m4, m6
16504 pmulhrsw m5, m7
16505 packuswb m3, m5
16506 movu [r0 + 193 * 16], m3
16507
16508 ; mode 6 [row 0]
16509 movu m6, [r5 + 13 * 16]
16510 pmaddubsw m3, m1, m6
16511 pmulhrsw m3, m7
16512 pmaddubsw m5, m2, m6
16513 pmulhrsw m5, m7
16514 packuswb m3, m5
16515 movu [r0 + 256 * 16], m3
16516 pmaddubsw m3, m0, m6
16517 pmulhrsw m3, m7
16518 pmaddubsw m5, m4, m6
16519 pmulhrsw m5, m7
16520 packuswb m3, m5
16521 movu [r0 + 257 * 16], m3
16522
16523 ; mode 7 [row 0]
16524 movu m6, [r5 + 9 * 16]
16525 pmaddubsw m3, m1, m6
16526 pmulhrsw m3, m7
16527 pmaddubsw m5, m2, m6
16528 pmulhrsw m5, m7
16529 packuswb m3, m5
16530 movu [r0 + 320 * 16], m3
16531 pmaddubsw m3, m0, m6
16532 pmulhrsw m3, m7
16533 pmaddubsw m5, m4, m6
16534 pmulhrsw m5, m7
16535 packuswb m3, m5
16536 movu [r0 + 321 * 16], m3
16537
16538 ; mode 7 [row 1]
16539 movu m6, [r5 + 18 * 16]
16540 pmaddubsw m3, m1, m6
16541 pmulhrsw m3, m7
16542 pmaddubsw m5, m2, m6
16543 pmulhrsw m5, m7
16544 packuswb m3, m5
16545 movu [r0 + 322 * 16], m3
16546
16547 ; mode 9 [row 8 - first half]
16548 movu [r0 + 464 * 16], m3
16549
16550 pmaddubsw m3, m0, m6
16551 pmulhrsw m3, m7
16552 pmaddubsw m5, m4, m6
16553 pmulhrsw m5, m7
16554 packuswb m3, m5
16555 movu [r0 + 323 * 16], m3
16556
16557 ; mode 9 [row 8 - second half]
16558 movu [r0 + 465 * 16], m3
16559
16560 ; mode 7 [row 2]
16561 movu m6, [r5 + 27 * 16]
16562 pmaddubsw m3, m1, m6
16563 pmulhrsw m3, m7
16564 pmaddubsw m5, m2, m6
16565 pmulhrsw m5, m7
16566 packuswb m3, m5
16567 movu [r0 + 324 * 16], m3
16568 pmaddubsw m3, m0, m6
16569 pmulhrsw m3, m7
16570 pmaddubsw m5, m4, m6
16571 pmulhrsw m5, m7
16572 packuswb m3, m5
16573 movu [r0 + 325 * 16], m3
16574
16575 ; mode 8 [row 0]
16576 movu m6, [r5 + 5 * 16]
16577 pmaddubsw m3, m1, m6
16578 pmulhrsw m3, m7
16579 pmaddubsw m5, m2, m6
16580 pmulhrsw m5, m7
16581 packuswb m3, m5
16582 movu [r0 + 384 * 16], m3
16583 pmaddubsw m3, m0, m6
16584 pmulhrsw m3, m7
16585 pmaddubsw m5, m4, m6
16586 pmulhrsw m5, m7
16587 packuswb m3, m5
16588 movu [r0 + 385 * 16], m3
16589
16590 ; mode 8 [row 1]
16591 movu m6, [r5 + 10 * 16]
16592 pmaddubsw m3, m1, m6
16593 pmulhrsw m3, m7
16594 pmaddubsw m5, m2, m6
16595 pmulhrsw m5, m7
16596 packuswb m3, m5
16597 movu [r0 + 386 * 16], m3
16598
16599 ; mode 9 [row 4 - first half]
16600 movu [r0 + 456 * 16], m3
16601
16602 pmaddubsw m3, m0, m6
16603 pmulhrsw m3, m7
16604 pmaddubsw m5, m4, m6
16605 pmulhrsw m5, m7
16606 packuswb m3, m5
16607 movu [r0 + 387 * 16], m3
16608
16609 ; mode 9 [row 4 - second half]
16610 movu [r0 + 457 * 16], m3
16611
16612 ; mode 8 [row 2]
16613 movu m6, [r5 + 15 * 16]
16614 pmaddubsw m3, m1, m6
16615 pmulhrsw m3, m7
16616 pmaddubsw m5, m2, m6
16617 pmulhrsw m5, m7
16618 packuswb m3, m5
16619 movu [r0 + 388 * 16], m3
16620 pmaddubsw m3, m0, m6
16621 pmulhrsw m3, m7
16622 pmaddubsw m5, m4, m6
16623 pmulhrsw m5, m7
16624 packuswb m3, m5
16625 movu [r0 + 389 * 16], m3
16626
16627 ; mode 8 [row 3]
16628 movu m6, [r5 + 20 * 16]
16629 pmaddubsw m3, m1, m6
16630 pmulhrsw m3, m7
16631 pmaddubsw m5, m2, m6
16632 pmulhrsw m5, m7
16633 packuswb m3, m5
16634 movu [r0 + 390 * 16], m3
16635
16636 ; mode 9 [row 9 - first half]
16637 movu [r0 + 466 * 16], m3
16638
16639 pmaddubsw m3, m0, m6
16640 pmulhrsw m3, m7
16641 pmaddubsw m5, m4, m6
16642 pmulhrsw m5, m7
16643 packuswb m3, m5
16644 movu [r0 + 391 * 16], m3
16645
16646 ; mode 9 [row 9 - second half]
16647 movu [r0 + 467 * 16], m3
16648
16649 ; mode 8 [row 4]
16650 movu m6, [r5 + 25 * 16]
16651 pmaddubsw m3, m1, m6
16652 pmulhrsw m3, m7
16653 pmaddubsw m5, m2, m6
16654 pmulhrsw m5, m7
16655 packuswb m3, m5
16656 movu [r0 + 392 * 16], m3
16657 pmaddubsw m3, m0, m6
16658 pmulhrsw m3, m7
16659 pmaddubsw m5, m4, m6
16660 pmulhrsw m5, m7
16661 packuswb m3, m5
16662 movu [r0 + 393 * 16], m3
16663
16664 ; mode 8 [row 5]
16665 movu m6, [r5 + 30 * 16]
16666 pmaddubsw m3, m1, m6
16667 pmulhrsw m3, m7
16668 pmaddubsw m5, m2, m6
16669 pmulhrsw m5, m7
16670 packuswb m3, m5
16671 movu [r0 + 394 * 16], m3
16672
16673 ; mode 9 [row 14 - first half]
16674 movu [r0 + 476 * 16], m3
16675
16676 pmaddubsw m3, m0, m6
16677 pmulhrsw m3, m7
16678 pmaddubsw m5, m4, m6
16679 pmulhrsw m5, m7
16680 packuswb m3, m5
16681 movu [r0 + 395 * 16], m3
16682
16683 ; mode 9 [row 14 - second half]
16684 movu [r0 + 477 * 16], m3
16685
16686 ; mode 9 [row 0]
16687 movu m6, [r5 + 2 * 16]
16688 pmaddubsw m3, m1, m6
16689 pmulhrsw m3, m7
16690 pmaddubsw m5, m2, m6
16691 pmulhrsw m5, m7
16692 packuswb m3, m5
16693 movu [r0 + 448 * 16], m3
16694 pmaddubsw m3, m0, m6
16695 pmulhrsw m3, m7
16696 pmaddubsw m5, m4, m6
16697 pmulhrsw m5, m7
16698 packuswb m3, m5
16699 movu [r0 + 449 * 16], m3
16700
16701 ; mode 9 [row 1]
16702 movu m6, [r5 + 4 * 16]
16703 pmaddubsw m3, m1, m6
16704 pmulhrsw m3, m7
16705 pmaddubsw m5, m2, m6
16706 pmulhrsw m5, m7
16707 packuswb m3, m5
16708 movu [r0 + 450 * 16], m3
16709 pmaddubsw m3, m0, m6
16710 pmulhrsw m3, m7
16711 pmaddubsw m5, m4, m6
16712 pmulhrsw m5, m7
16713 packuswb m3, m5
16714 movu [r0 + 451 * 16], m3
16715
16716 ; mode 9 [row 2]
16717 movu m6, [r5 + 6 * 16]
16718 pmaddubsw m3, m1, m6
16719 pmulhrsw m3, m7
16720 pmaddubsw m5, m2, m6
16721 pmulhrsw m5, m7
16722 packuswb m3, m5
16723 movu [r0 + 452 * 16], m3
16724 pmaddubsw m3, m0, m6
16725 pmulhrsw m3, m7
16726 pmaddubsw m5, m4, m6
16727 pmulhrsw m5, m7
16728 packuswb m3, m5
16729 movu [r0 + 453 * 16], m3
16730
16731 ; mode 9 [row 3]
16732 movu m6, [r5 + 8 * 16]
16733 pmaddubsw m3, m1, m6
16734 pmulhrsw m3, m7
16735 pmaddubsw m5, m2, m6
16736 pmulhrsw m5, m7
16737 packuswb m3, m5
16738 movu [r0 + 454 * 16], m3
16739 pmaddubsw m3, m0, m6
16740 pmulhrsw m3, m7
16741 pmaddubsw m5, m4, m6
16742 pmulhrsw m5, m7
16743 packuswb m3, m5
16744 movu [r0 + 455 * 16], m3
16745
16746 ; mode 9 [row 5]
16747 movu m6, [r5 + 12 * 16]
16748 pmaddubsw m3, m1, m6
16749 pmulhrsw m3, m7
16750 pmaddubsw m5, m2, m6
16751 pmulhrsw m5, m7
16752 packuswb m3, m5
16753 movu [r0 + 458 * 16], m3
16754 pmaddubsw m3, m0, m6
16755 pmulhrsw m3, m7
16756 pmaddubsw m5, m4, m6
16757 pmulhrsw m5, m7
16758 packuswb m3, m5
16759 movu [r0 + 459 * 16], m3
16760
16761 ; mode 9 [row 6]
16762 movu m6, [r5 + 14 * 16]
16763 pmaddubsw m3, m1, m6
16764 pmulhrsw m3, m7
16765 pmaddubsw m5, m2, m6
16766 pmulhrsw m5, m7
16767 packuswb m3, m5
16768 movu [r0 + 460 * 16], m3
16769 pmaddubsw m3, m0, m6
16770 pmulhrsw m3, m7
16771 pmaddubsw m5, m4, m6
16772 pmulhrsw m5, m7
16773 packuswb m3, m5
16774 movu [r0 + 461 * 16], m3
16775
16776 ; mode 9 [row 7]
16777 movu m6, [r5 + 16 * 16]
16778 pmaddubsw m3, m1, m6
16779 pmulhrsw m3, m7
16780 pmaddubsw m5, m2, m6
16781 pmulhrsw m5, m7
16782 packuswb m3, m5
16783 movu [r0 + 462 * 16], m3
16784 pmaddubsw m3, m0, m6
16785 pmulhrsw m3, m7
16786 pmaddubsw m5, m4, m6
16787 pmulhrsw m5, m7
16788 packuswb m3, m5
16789 movu [r0 + 463 * 16], m3
16790
16791 ; mode 9 [row 10]
16792 movu m6, [r5 + 22 * 16]
16793 pmaddubsw m3, m1, m6
16794 pmulhrsw m3, m7
16795 pmaddubsw m5, m2, m6
16796 pmulhrsw m5, m7
16797 packuswb m3, m5
16798 movu [r0 + 468 * 16], m3
16799 pmaddubsw m3, m0, m6
16800 pmulhrsw m3, m7
16801 pmaddubsw m5, m4, m6
16802 pmulhrsw m5, m7
16803 packuswb m3, m5
16804 movu [r0 + 469 * 16], m3
16805
16806 ; mode 9 [row 11]
16807 movu m6, [r5 + 24 * 16]
16808 pmaddubsw m3, m1, m6
16809 pmulhrsw m3, m7
16810 pmaddubsw m5, m2, m6
16811 pmulhrsw m5, m7
16812 packuswb m3, m5
16813 movu [r0 + 470 * 16], m3
16814 pmaddubsw m3, m0, m6
16815 pmulhrsw m3, m7
16816 pmaddubsw m5, m4, m6
16817 pmulhrsw m5, m7
16818 packuswb m3, m5
16819 movu [r0 + 471 * 16], m3
16820
16821 ; mode 9 [row 13]
16822 movu m6, [r5 + 28 * 16]
16823 pmaddubsw m3, m1, m6
16824 pmulhrsw m3, m7
16825 pmaddubsw m5, m2, m6
16826 pmulhrsw m5, m7
16827 packuswb m3, m5
16828 movu [r0 + 474 * 16], m3
16829 pmaddubsw m3, m0, m6
16830 pmulhrsw m3, m7
16831 pmaddubsw m5, m4, m6
16832 pmulhrsw m5, m7
16833 packuswb m3, m5
16834 movu [r0 + 475 * 16], m3
16835
16836 ; mode 3 [row 1]
16837 movu m6, [r5 + 20 * 16]
16838 movu m0, [r4 + 2]
16839 movd m1, [r4 + 3]
16840 palignr m1, m0, 1
16841 punpcklbw m0, m1
16842 pmaddubsw m1, m0, m6
16843 pmulhrsw m1, m7
16844 movu m2, [r4 + 10]
16845 movd m3, [r4 + 11]
16846 palignr m3, m2, 1
16847 punpcklbw m2, m3
16848 pmaddubsw m3, m2, m6
16849 pmulhrsw m3, m7
16850 packuswb m1, m3
16851 movu [r0 + 66 * 16], m1
16852
16853 ; mode 6 [row 3 - first half]
16854 movu [r0 + 262 * 16], m1
16855
16856 ; mode 9 [row 25 - first half]
16857 movu [r0 + 498 * 16], m1
16858
16859 movu m1, [r4 + 18]
16860 movd m3, [r4 + 19]
16861 palignr m3, m1, 1
16862 punpcklbw m1, m3
16863 pmaddubsw m3, m1, m6
16864 pmulhrsw m3, m7
16865 movu m4, [r4 + 26]
16866 movd m5, [r4 + 27]
16867 palignr m5, m4, 1
16868 punpcklbw m4, m5
16869 pmaddubsw m5, m4, m6
16870 pmulhrsw m5, m7
16871 packuswb m3, m5
16872 movu [r0 + 67 * 16], m3
16873
16874 ; mode 6 [row 3 - second half]
16875 movu [r0 + 263 * 16], m3
16876
16877 ; mode 9 [row 25 - second half]
16878 movu [r0 + 499 * 16], m3
16879
16880 ; mode 4 [row 1]
16881 movu m6, [r5 + 10 * 16]
16882 pmaddubsw m3, m0, m6
16883 pmulhrsw m3, m7
16884 pmaddubsw m5, m2, m6
16885 pmulhrsw m5, m7
16886 packuswb m3, m5
16887 movu [r0 + 130 * 16], m3
16888
16889 ; mode 9 [row 20 - first half]
16890 movu [r0 + 488 * 16], m3
16891
16892 pmaddubsw m3, m1, m6
16893 pmulhrsw m3, m7
16894 pmaddubsw m5, m4, m6
16895 pmulhrsw m5, m7
16896 packuswb m3, m5
16897 movu [r0 + 131 * 16], m3
16898
16899 ; mode 9 [row 20 - second half]
16900 movu [r0 + 489 * 16], m3
16901
16902 ; mode 4 [row 2]
16903 movu m6, [r5 + 31 * 16]
16904 pmaddubsw m3, m0, m6
16905 pmulhrsw m3, m7
16906 pmaddubsw m5, m2, m6
16907 pmulhrsw m5, m7
16908 packuswb m3, m5
16909 movu [r0 + 132 * 16], m3
16910
16911 ; mode 7 [row 6 - first half]
16912 movu [r0 + 332 * 16], m3
16913
16914 pmaddubsw m3, m1, m6
16915 pmulhrsw m3, m7
16916 pmaddubsw m5, m4, m6
16917 pmulhrsw m5, m7
16918 packuswb m3, m5
16919 movu [r0 + 133 * 16], m3
16920
16921 ; mode 7 [row 6 - second half]
16922 movu [r0 + 333 * 16], m3
16923
16924 ; mode 5 [row 1]
16925 movu m6, [r5 + 2 * 16]
16926 pmaddubsw m3, m0, m6
16927 pmulhrsw m3, m7
16928 pmaddubsw m5, m2, m6
16929 pmulhrsw m5, m7
16930 packuswb m3, m5
16931 movu [r0 + 194 * 16], m3
16932
16933 ; mode 9 [row 16 - first half]
16934 movu [r0 + 480 * 16], m3
16935
16936 pmaddubsw m3, m1, m6
16937 pmulhrsw m3, m7
16938 pmaddubsw m5, m4, m6
16939 pmulhrsw m5, m7
16940 packuswb m3, m5
16941 movu [r0 + 195 * 16], m3
16942
16943 ; mode 9 [row 16 - second half]
16944 movu [r0 + 481 * 16], m3
16945
16946 ; mode 5 [row 2]
16947 movu m6, [r5 + 19 * 16]
16948 pmaddubsw m3, m0, m6
16949 pmulhrsw m3, m7
16950 pmaddubsw m5, m2, m6
16951 pmulhrsw m5, m7
16952 packuswb m3, m5
16953 movu [r0 + 196 * 16], m3
16954 pmaddubsw m3, m1, m6
16955 pmulhrsw m3, m7
16956 pmaddubsw m5, m4, m6
16957 pmulhrsw m5, m7
16958 packuswb m3, m5
16959 movu [r0 + 197 * 16], m3
16960
16961 ; mode 6 [row 2]
16962 movu m6, [r5 + 7 * 16]
16963 pmaddubsw m3, m0, m6
16964 pmulhrsw m3, m7
16965 pmaddubsw m5, m2, m6
16966 pmulhrsw m5, m7
16967 packuswb m3, m5
16968 movu [r0 + 260 * 16], m3
16969 pmaddubsw m3, m1, m6
16970 pmulhrsw m3, m7
16971 pmaddubsw m5, m4, m6
16972 pmulhrsw m5, m7
16973 packuswb m3, m5
16974 movu [r0 + 261 * 16], m3
16975
16976 ; mode 7 [row 3]
16977 movu m6, [r5 + 4 * 16]
16978 pmaddubsw m3, m0, m6
16979 pmulhrsw m3, m7
16980 pmaddubsw m5, m2, m6
16981 pmulhrsw m5, m7
16982 packuswb m3, m5
16983 movu [r0 + 326 * 16], m3
16984
16985 ; mode 9 [row 17 - first half]
16986 movu [r0 + 482 * 16], m3
16987
16988 pmaddubsw m3, m1, m6
16989 pmulhrsw m3, m7
16990 pmaddubsw m5, m4, m6
16991 pmulhrsw m5, m7
16992 packuswb m3, m5
16993 movu [r0 + 327 * 16], m3
16994
16995 ; mode 9 [row 17 - second half]
16996 movu [r0 + 483 * 16], m3
16997
16998 ; mode 7 [row 4]
16999 movu m6, [r5 + 13 * 16]
17000 pmaddubsw m3, m0, m6
17001 pmulhrsw m3, m7
17002 pmaddubsw m5, m2, m6
17003 pmulhrsw m5, m7
17004 packuswb m3, m5
17005 movu [r0 + 328 * 16], m3
17006
17007 ; mode 8 [row 8 - first half]
17008 movu [r0 + 400 * 16], m3
17009
17010 pmaddubsw m3, m1, m6
17011 pmulhrsw m3, m7
17012 pmaddubsw m5, m4, m6
17013 pmulhrsw m5, m7
17014 packuswb m3, m5
17015 movu [r0 + 329 * 16], m3
17016
17017 ; mode 8 [row 8 - second half]
17018 movu [r0 + 401 * 16], m3
17019
17020 ; mode 7 [row 5]
17021 movu m6, [r5 + 22 * 16]
17022 pmaddubsw m3, m0, m6
17023 pmulhrsw m3, m7
17024 pmaddubsw m5, m2, m6
17025 pmulhrsw m5, m7
17026 packuswb m3, m5
17027 movu [r0 + 330 * 16], m3
17028
17029 ; mode 9 [row 26 - first half]
17030 movu [r0 + 500 * 16], m3
17031
17032 pmaddubsw m3, m1, m6
17033 pmulhrsw m3, m7
17034 pmaddubsw m5, m4, m6
17035 pmulhrsw m5, m7
17036 packuswb m3, m5
17037 movu [r0 + 331 * 16], m3
17038
17039 ; mode 9 [row 26 - second half]
17040 movu [r0 + 501 * 16], m3
17041
17042 ; mode 8 [row 6]
17043 movu m6, [r5 + 3 * 16]
17044 pmaddubsw m3, m0, m6
17045 pmulhrsw m3, m7
17046 pmaddubsw m5, m2, m6
17047 pmulhrsw m5, m7
17048 packuswb m3, m5
17049 movu [r0 + 396 * 16], m3
17050 pmaddubsw m3, m1, m6
17051 pmulhrsw m3, m7
17052 pmaddubsw m5, m4, m6
17053 pmulhrsw m5, m7
17054 packuswb m3, m5
17055 movu [r0 + 397 * 16], m3
17056
17057 ; mode 9 [row 18]
17058 movu m6, [r5 + 6 * 16]
17059 pmaddubsw m3, m0, m6
17060 pmulhrsw m3, m7
17061 pmaddubsw m5, m2, m6
17062 pmulhrsw m5, m7
17063 packuswb m3, m5
17064 movu [r0 + 484 * 16], m3
17065 pmaddubsw m3, m1, m6
17066 pmulhrsw m3, m7
17067 pmaddubsw m5, m4, m6
17068 pmulhrsw m5, m7
17069 packuswb m3, m5
17070 movu [r0 + 485 * 16], m3
17071
17072 ; mode 9 [row 21]
17073 movu m6, [r5 + 12 * 16]
17074 pmaddubsw m3, m0, m6
17075 pmulhrsw m3, m7
17076 pmaddubsw m5, m2, m6
17077 pmulhrsw m5, m7
17078 packuswb m3, m5
17079 movu [r0 + 490 * 16], m3
17080 pmaddubsw m3, m1, m6
17081 pmulhrsw m3, m7
17082 pmaddubsw m5, m4, m6
17083 pmulhrsw m5, m7
17084 packuswb m3, m5
17085 movu [r0 + 491 * 16], m3
17086
17087 ; mode 9 [row 22]
17088 movu m6, [r5 + 14 * 16]
17089 pmaddubsw m3, m0, m6
17090 pmulhrsw m3, m7
17091 pmaddubsw m5, m2, m6
17092 pmulhrsw m5, m7
17093 packuswb m3, m5
17094 movu [r0 + 492 * 16], m3
17095 pmaddubsw m3, m1, m6
17096 pmulhrsw m3, m7
17097 pmaddubsw m5, m4, m6
17098 pmulhrsw m5, m7
17099 packuswb m3, m5
17100 movu [r0 + 493 * 16], m3
17101
17102 ; mode 9 [row 23]
17103 movu m6, [r5 + 16 * 16]
17104 pmaddubsw m3, m0, m6
17105 pmulhrsw m3, m7
17106 pmaddubsw m5, m2, m6
17107 pmulhrsw m5, m7
17108 packuswb m3, m5
17109 movu [r0 + 494 * 16], m3
17110 pmaddubsw m3, m1, m6
17111 pmulhrsw m3, m7
17112 pmaddubsw m5, m4, m6
17113 pmulhrsw m5, m7
17114 packuswb m3, m5
17115 movu [r0 + 495 * 16], m3
17116
17117 ; mode 9 [row 27]
17118 movu m6, [r5 + 24 * 16]
17119 pmaddubsw m3, m0, m6
17120 pmulhrsw m3, m7
17121 pmaddubsw m5, m2, m6
17122 pmulhrsw m5, m7
17123 packuswb m3, m5
17124 movu [r0 + 502 * 16], m3
17125 pmaddubsw m3, m1, m6
17126 pmulhrsw m3, m7
17127 pmaddubsw m5, m4, m6
17128 pmulhrsw m5, m7
17129 packuswb m3, m5
17130 movu [r0 + 503 * 16], m3
17131
17132 ; mode 9 [row 28]
17133 movu m6, [r5 + 26 * 16]
17134 pmaddubsw m3, m0, m6
17135 pmulhrsw m3, m7
17136 pmaddubsw m5, m2, m6
17137 pmulhrsw m5, m7
17138 packuswb m3, m5
17139 movu [r0 + 504 * 16], m3
17140 pmaddubsw m3, m1, m6
17141 pmulhrsw m3, m7
17142 pmaddubsw m5, m4, m6
17143 pmulhrsw m5, m7
17144 packuswb m3, m5
17145 movu [r0 + 505 * 16], m3
17146
17147 ; mode 9 [row 30]
17148 movu m6, [r5 + 30 * 16]
17149 pmaddubsw m3, m0, m6
17150 pmulhrsw m3, m7
17151 pmaddubsw m5, m2, m6
17152 pmulhrsw m5, m7
17153 packuswb m3, m5
17154 movu [r0 + 508 * 16], m3
17155 pmaddubsw m3, m1, m6
17156 pmulhrsw m3, m7
17157 pmaddubsw m5, m4, m6
17158 pmulhrsw m5, m7
17159 packuswb m3, m5
17160 movu [r0 + 509 * 16], m3
17161
17162 ; mode 8 [row 7]
17163 movu m6, [r5 + 8 * 16]
17164 pmaddubsw m3, m0, m6
17165 pmulhrsw m3, m7
17166 pmaddubsw m5, m2, m6
17167 pmulhrsw m5, m7
17168 packuswb m3, m5
17169 movu [r0 + 398 * 16], m3
17170
17171 ; mode 9 [row 19 - first half]
17172 movu [r0 + 486 * 16], m3
17173
17174 pmaddubsw m3, m1, m6
17175 pmulhrsw m3, m7
17176 pmaddubsw m5, m4, m6
17177 pmulhrsw m5, m7
17178 packuswb m3, m5
17179 movu [r0 + 399 * 16], m3
17180
17181 ; mode 9 [row 19 - second half]
17182 movu [r0 + 487 * 16], m3
17183
17184 ; mode 8 [row 9]
17185 movu m6, [r5 + 18 * 16]
17186 pmaddubsw m3, m0, m6
17187 pmulhrsw m3, m7
17188 pmaddubsw m5, m2, m6
17189 pmulhrsw m5, m7
17190 packuswb m3, m5
17191 movu [r0 + 402 * 16], m3
17192
17193 ; mode 9 [row 24 - first half]
17194 movu [r0 + 496 * 16], m3
17195
17196 pmaddubsw m3, m1, m6
17197 pmulhrsw m3, m7
17198 pmaddubsw m5, m4, m6
17199 pmulhrsw m5, m7
17200 packuswb m3, m5
17201 movu [r0 + 403 * 16], m3
17202
17203 ; mode 9 [row 24 - second half]
17204 movu [r0 + 497 * 16], m3
17205
17206 ; mode 8 [row 10]
17207 movu m6, [r5 + 23 * 16]
17208 pmaddubsw m3, m0, m6
17209 pmulhrsw m3, m7
17210 pmaddubsw m5, m2, m6
17211 pmulhrsw m5, m7
17212 packuswb m3, m5
17213 movu [r0 + 404 * 16], m3
17214 pmaddubsw m3, m1, m6
17215 pmulhrsw m3, m7
17216 pmaddubsw m5, m4, m6
17217 pmulhrsw m5, m7
17218 packuswb m3, m5
17219 movu [r0 + 405 * 16], m3
17220
17221 ; mode 8 [row 11]
17222 movu m6, [r5 + 28 * 16]
17223 pmaddubsw m3, m0, m6
17224 pmulhrsw m3, m7
17225 pmaddubsw m5, m2, m6
17226 pmulhrsw m5, m7
17227 packuswb m3, m5
17228 movu [r0 + 406 * 16], m3
17229
17230 ; mode 9 [row 29 - first half]
17231 movu [r0 + 506 * 16], m3
17232
17233 pmaddubsw m3, m1, m6
17234 pmulhrsw m3, m7
17235 pmaddubsw m5, m4, m6
17236 pmulhrsw m5, m7
17237 packuswb m3, m5
17238 movu [r0 + 407 * 16], m3
17239
17240 ; mode 9 [row 29 - second half]
17241 movu [r0 + 507 * 16], m3
17242
17243 ; mode 3 [row 2]
17244 movu m6, [r5 + 14 * 16]
17245 movu m0, [r4 + 3]
17246 movd m1, [r4 + 4]
17247 palignr m1, m0, 1
17248 punpcklbw m0, m1
17249 pmaddubsw m1, m0, m6
17250 pmulhrsw m1, m7
17251 movu m2, [r4 + 11]
17252 movd m3, [r4 + 12]
17253 palignr m3, m2, 1
17254 punpcklbw m2, m3
17255 pmaddubsw m3, m2, m6
17256 pmulhrsw m3, m7
17257 packuswb m1, m3
17258 movu [r0 + 68 * 16], m1
17259
17260 ; mode 6 [row 5 - first half]
17261 movu [r0 + 266 * 16], m1
17262
17263 movu m1, [r4 + 19]
17264 movd m3, [r4 + 20]
17265 palignr m3, m1, 1
17266 punpcklbw m1, m3
17267 pmaddubsw m3, m1, m6
17268 pmulhrsw m3, m7
17269 movu m4, [r4 + 27]
17270 movd m5, [r4 + 28]
17271 palignr m5, m4, 1
17272 punpcklbw m4, m5
17273 pmaddubsw m5, m4, m6
17274 pmulhrsw m5, m7
17275 packuswb m3, m5
17276 movu [r0 + 69 * 16], m3
17277
17278 ; mode 6 [row 5 - second half]
17279 movu [r0 + 267 * 16], m3
17280
17281 ; mode 4 [row 3]
17282 movu m6, [r5 + 20 * 16]
17283 pmaddubsw m3, m0, m6
17284 pmulhrsw m3, m7
17285 pmaddubsw m5, m2, m6
17286 pmulhrsw m5, m7
17287 packuswb m3, m5
17288 movu [r0 + 134 * 16], m3
17289 pmaddubsw m3, m1, m6
17290 pmulhrsw m3, m7
17291 pmaddubsw m5, m4, m6
17292 pmulhrsw m5, m7
17293 packuswb m3, m5
17294 movu [r0 + 135 * 16], m3
17295
17296 ; mode 5 [row 3]
17297 movu m6, [r5 + 4 * 16]
17298 pmaddubsw m3, m0, m6
17299 pmulhrsw m3, m7
17300 pmaddubsw m5, m2, m6
17301 pmulhrsw m5, m7
17302 packuswb m3, m5
17303 movu [r0 + 198 * 16], m3
17304 pmaddubsw m3, m1, m6
17305 pmulhrsw m3, m7
17306 pmaddubsw m5, m4, m6
17307 pmulhrsw m5, m7
17308 packuswb m3, m5
17309 movu [r0 + 199 * 16], m3
17310
17311 ; mode 5 [row 4]
17312 movu m6, [r5 + 21 * 16]
17313 pmaddubsw m3, m0, m6
17314 pmulhrsw m3, m7
17315 pmaddubsw m5, m2, m6
17316 pmulhrsw m5, m7
17317 packuswb m3, m5
17318 movu [r0 + 200 * 16], m3
17319
17320 ; mode 8 [row 16 - first half]
17321 movu [r0 + 416 * 16], m3
17322
17323 pmaddubsw m3, m1, m6
17324 pmulhrsw m3, m7
17325 pmaddubsw m5, m4, m6
17326 pmulhrsw m5, m7
17327 packuswb m3, m5
17328 movu [r0 + 201 * 16], m3
17329
17330 ; mode 8 [row 16 - second half]
17331 movu [r0 + 417 * 16], m3
17332
17333 ; mode 6 [row 4]
17334 movu m6, [r5 + 1 * 16]
17335 pmaddubsw m3, m0, m6
17336 pmulhrsw m3, m7
17337 pmaddubsw m5, m2, m6
17338 pmulhrsw m5, m7
17339 packuswb m3, m5
17340 movu [r0 + 264 * 16], m3
17341
17342 ; mode 8 [row 12 - first half]
17343 movu [r0 + 408 * 16], m3
17344
17345 pmaddubsw m3, m1, m6
17346 pmulhrsw m3, m7
17347 pmaddubsw m5, m4, m6
17348 pmulhrsw m5, m7
17349 packuswb m3, m5
17350 movu [r0 + 265 * 16], m3
17351
17352 ; mode 8 [row 12 - second half]
17353 movu [r0 + 409 * 16], m3
17354
17355 ; mode 6 [row 6]
17356 movu m6, [r5 + 27 * 16]
17357 pmaddubsw m3, m0, m6
17358 pmulhrsw m3, m7
17359 pmaddubsw m5, m2, m6
17360 pmulhrsw m5, m7
17361 packuswb m3, m5
17362 movu [r0 + 268 * 16], m3
17363 pmaddubsw m3, m1, m6
17364 pmulhrsw m3, m7
17365 pmaddubsw m5, m4, m6
17366 pmulhrsw m5, m7
17367 packuswb m3, m5
17368 movu [r0 + 269 * 16], m3
17369
17370 ; mode 7 [row 7]
17371 movu m6, [r5 + 8 * 16]
17372 pmaddubsw m3, m0, m6
17373 pmulhrsw m3, m7
17374 pmaddubsw m5, m2, m6
17375 pmulhrsw m5, m7
17376 packuswb m3, m5
17377 movu [r0 + 334 * 16], m3
17378 pmaddubsw m3, m1, m6
17379 pmulhrsw m3, m7
17380 pmaddubsw m5, m4, m6
17381 pmulhrsw m5, m7
17382 packuswb m3, m5
17383 movu [r0 + 335 * 16], m3
17384
17385 ; mode 7 [row 8]
17386 movu m6, [r5 + 17 * 16]
17387 pmaddubsw m3, m0, m6
17388 pmulhrsw m3, m7
17389 pmaddubsw m5, m2, m6
17390 pmulhrsw m5, m7
17391 packuswb m3, m5
17392 movu [r0 + 336 * 16], m3
17393 pmaddubsw m3, m1, m6
17394 pmulhrsw m3, m7
17395 pmaddubsw m5, m4, m6
17396 pmulhrsw m5, m7
17397 packuswb m3, m5
17398 movu [r0 + 337 * 16], m3
17399
17400 ; mode 7 [row 9]
17401 movu m6, [r5 + 26 * 16]
17402 pmaddubsw m3, m0, m6
17403 pmulhrsw m3, m7
17404 pmaddubsw m5, m2, m6
17405 pmulhrsw m5, m7
17406 packuswb m3, m5
17407 movu [r0 + 338 * 16], m3
17408
17409 ; mode 8 [row 17 - first half]
17410 movu [r0 + 418 * 16], m3
17411
17412 pmaddubsw m3, m1, m6
17413 pmulhrsw m3, m7
17414 pmaddubsw m5, m4, m6
17415 pmulhrsw m5, m7
17416 packuswb m3, m5
17417 movu [r0 + 339 * 16], m3
17418
17419 ; mode 8 [row 17 - second half]
17420 movu [r0 + 419 * 16], m3
17421
17422 ; mode 8 [row 13]
17423 movu m6, [r5 + 6 * 16]
17424 pmaddubsw m3, m0, m6
17425 pmulhrsw m3, m7
17426 pmaddubsw m5, m2, m6
17427 pmulhrsw m5, m7
17428 packuswb m3, m5
17429 movu [r0 + 410 * 16], m3
17430 pmaddubsw m3, m1, m6
17431 pmulhrsw m3, m7
17432 pmaddubsw m5, m4, m6
17433 pmulhrsw m5, m7
17434 packuswb m3, m5
17435 movu [r0 + 411 * 16], m3
17436
17437 ; mode 8 [row 14]
17438 movu m6, [r5 + 11 * 16]
17439 pmaddubsw m3, m0, m6
17440 pmulhrsw m3, m7
17441 pmaddubsw m5, m2, m6
17442 pmulhrsw m5, m7
17443 packuswb m3, m5
17444 movu [r0 + 412 * 16], m3
17445 pmaddubsw m3, m1, m6
17446 pmulhrsw m3, m7
17447 pmaddubsw m5, m4, m6
17448 pmulhrsw m5, m7
17449 packuswb m3, m5
17450 movu [r0 + 413 * 16], m3
17451
17452 ; mode 8 [row 15]
17453 movu m6, [r5 + 16 * 16]
17454 pmaddubsw m3, m0, m6
17455 pmulhrsw m3, m7
17456 pmaddubsw m5, m2, m6
17457 pmulhrsw m5, m7
17458 packuswb m3, m5
17459 movu [r0 + 414 * 16], m3
17460 pmaddubsw m3, m1, m6
17461 pmulhrsw m3, m7
17462 pmaddubsw m5, m4, m6
17463 pmulhrsw m5, m7
17464 packuswb m3, m5
17465 movu [r0 + 415 * 16], m3
17466
17467 ; mode 8 [row 18]
17468 movu m6, [r5 + 31 * 16]
17469 pmaddubsw m3, m0, m6
17470 pmulhrsw m3, m7
17471 pmaddubsw m5, m2, m6
17472 pmulhrsw m5, m7
17473 packuswb m3, m5
17474 movu [r0 + 420 * 16], m3
17475 pmaddubsw m3, m1, m6
17476 pmulhrsw m3, m7
17477 pmaddubsw m5, m4, m6
17478 pmulhrsw m5, m7
17479 packuswb m3, m5
17480 movu [r0 + 421 * 16], m3
17481
17482 ; mode 3 [row 3]
17483 movu m6, [r5 + 8 * 16]
17484 movu m0, [r4 + 4]
17485 movd m1, [r4 + 5]
17486 palignr m1, m0, 1
17487 punpcklbw m0, m1
17488 pmaddubsw m1, m0, m6
17489 pmulhrsw m1, m7
17490 movu m2, [r4 + 12]
17491 movd m3, [r4 + 13]
17492 palignr m3, m2, 1
17493 punpcklbw m2, m3
17494 pmaddubsw m3, m2, m6
17495 pmulhrsw m3, m7
17496 packuswb m1, m3
17497 movu [r0 + 70 * 16], m1
17498
17499 ; mode 6 [row 7 - first half]
17500 movu [r0 + 270 * 16], m1
17501
17502 movu m1, [r4 + 20]
17503 movd m3, [r4 + 21]
17504 palignr m3, m1, 1
17505 punpcklbw m1, m3
17506 pmaddubsw m3, m1, m6
17507 pmulhrsw m3, m7
17508 movu m4, [r4 + 28]
17509 movd m5, [r4 + 29]
17510 palignr m5, m4, 1
17511 punpcklbw m4, m5
17512 pmaddubsw m5, m4, m6
17513 pmulhrsw m5, m7
17514 packuswb m3, m5
17515 movu [r0 + 71 * 16], m3
17516
17517 ; mode 6 [row 7 - second half]
17518 movu [r0 + 271 * 16], m3
17519
17520 ; mode 4 [row 4]
17521 movu m6, [r5 + 9 * 16]
17522 pmaddubsw m3, m0, m6
17523 pmulhrsw m3, m7
17524 pmaddubsw m5, m2, m6
17525 pmulhrsw m5, m7
17526 packuswb m3, m5
17527 movu [r0 + 136 * 16], m3
17528
17529 ; mode 8 [row 20 - first half]
17530 movu [r0 + 424 * 16], m3
17531
17532 pmaddubsw m3, m1, m6
17533 pmulhrsw m3, m7
17534 pmaddubsw m5, m4, m6
17535 pmulhrsw m5, m7
17536 packuswb m3, m5
17537 movu [r0 + 137 * 16], m3
17538
17539 ; mode 8 [row 20 - second half]
17540 movu [r0 + 425 * 16], m3
17541
17542 ; mode 4 [row 5]
17543 movu m6, [r5 + 30 * 16]
17544 pmaddubsw m3, m0, m6
17545 pmulhrsw m3, m7
17546 pmaddubsw m5, m2, m6
17547 pmulhrsw m5, m7
17548 packuswb m3, m5
17549 movu [r0 + 138 * 16], m3
17550
17551 ; mode 7 [row 13 - first half]
17552 movu [r0 + 346 * 16], m3
17553
17554 pmaddubsw m3, m1, m6
17555 pmulhrsw m3, m7
17556 pmaddubsw m5, m4, m6
17557 pmulhrsw m5, m7
17558 packuswb m3, m5
17559 movu [r0 + 139 * 16], m3
17560
17561 ; mode 7 [row 13 - second half]
17562 movu [r0 + 347 * 16], m3
17563
17564 ; mode 5 [row 5]
17565 movu m6, [r5 + 6 * 16]
17566 pmaddubsw m3, m0, m6
17567 pmulhrsw m3, m7
17568 pmaddubsw m5, m2, m6
17569 pmulhrsw m5, m7
17570 packuswb m3, m5
17571 movu [r0 + 202 * 16], m3
17572 pmaddubsw m3, m1, m6
17573 pmulhrsw m3, m7
17574 pmaddubsw m5, m4, m6
17575 pmulhrsw m5, m7
17576 packuswb m3, m5
17577 movu [r0 + 203 * 16], m3
17578
17579 ; mode 5 [row 6]
17580 movu m6, [r5 + 23 * 16]
17581 pmaddubsw m3, m0, m6
17582 pmulhrsw m3, m7
17583 pmaddubsw m5, m2, m6
17584 pmulhrsw m5, m7
17585 packuswb m3, m5
17586 movu [r0 + 204 * 16], m3
17587 pmaddubsw m3, m1, m6
17588 pmulhrsw m3, m7
17589 pmaddubsw m5, m4, m6
17590 pmulhrsw m5, m7
17591 packuswb m3, m5
17592 movu [r0 + 205 * 16], m3
17593
17594 ; mode 6 [row 8]
17595 movu m6, [r5 + 21 * 16]
17596 pmaddubsw m3, m0, m6
17597 pmulhrsw m3, m7
17598 pmaddubsw m5, m2, m6
17599 pmulhrsw m5, m7
17600 packuswb m3, m5
17601 movu [r0 + 272 * 16], m3
17602
17603 ; mode 7 [row 12 - first half]
17604 movu [r0 + 344 * 16], m3
17605
17606 pmaddubsw m3, m1, m6
17607 pmulhrsw m3, m7
17608 pmaddubsw m5, m4, m6
17609 pmulhrsw m5, m7
17610 packuswb m3, m5
17611 movu [r0 + 273 * 16], m3
17612
17613 ; mode 7 [row 12 - second half]
17614 movu [r0 + 345 * 16], m3
17615
17616 ; mode 7 [row 10]
17617 movu m6, [r5 + 3 * 16]
17618 pmaddubsw m3, m0, m6
17619 pmulhrsw m3, m7
17620 pmaddubsw m5, m2, m6
17621 pmulhrsw m5, m7
17622 packuswb m3, m5
17623 movu [r0 + 340 * 16], m3
17624 pmaddubsw m3, m1, m6
17625 pmulhrsw m3, m7
17626 pmaddubsw m5, m4, m6
17627 pmulhrsw m5, m7
17628 packuswb m3, m5
17629 movu [r0 + 341 * 16], m3
17630
17631 ; mode 7 [row 11]
17632 movu m6, [r5 + 12 * 16]
17633 pmaddubsw m3, m0, m6
17634 pmulhrsw m3, m7
17635 pmaddubsw m5, m2, m6
17636 pmulhrsw m5, m7
17637 packuswb m3, m5
17638 movu [r0 + 342 * 16], m3
17639 pmaddubsw m3, m1, m6
17640 pmulhrsw m3, m7
17641 pmaddubsw m5, m4, m6
17642 pmulhrsw m5, m7
17643 packuswb m3, m5
17644 movu [r0 + 343 * 16], m3
17645
17646 ; mode 8 [row 19]
17647 movu m6, [r5 + 4 * 16]
17648 pmaddubsw m3, m0, m6
17649 pmulhrsw m3, m7
17650 pmaddubsw m5, m2, m6
17651 pmulhrsw m5, m7
17652 packuswb m3, m5
17653 movu [r0 + 422 * 16], m3
17654 pmaddubsw m3, m1, m6
17655 pmulhrsw m3, m7
17656 pmaddubsw m5, m4, m6
17657 pmulhrsw m5, m7
17658 packuswb m3, m5
17659 movu [r0 + 423 * 16], m3
17660
17661 ; mode 8 [row 21]
17662 movu m6, [r5 + 14 * 16]
17663 pmaddubsw m3, m0, m6
17664 pmulhrsw m3, m7
17665 pmaddubsw m5, m2, m6
17666 pmulhrsw m5, m7
17667 packuswb m3, m5
17668 movu [r0 + 426 * 16], m3
17669 pmaddubsw m3, m1, m6
17670 pmulhrsw m3, m7
17671 pmaddubsw m5, m4, m6
17672 pmulhrsw m5, m7
17673 packuswb m3, m5
17674 movu [r0 + 427 * 16], m3
17675
17676 ; mode 8 [row 22]
17677 movu m6, [r5 + 19 * 16]
17678 pmaddubsw m3, m0, m6
17679 pmulhrsw m3, m7
17680 pmaddubsw m5, m2, m6
17681 pmulhrsw m5, m7
17682 packuswb m3, m5
17683 movu [r0 + 428 * 16], m3
17684 pmaddubsw m3, m1, m6
17685 pmulhrsw m3, m7
17686 pmaddubsw m5, m4, m6
17687 pmulhrsw m5, m7
17688 packuswb m3, m5
17689 movu [r0 + 429 * 16], m3
17690
17691 ; mode 8 [row 23]
17692 movu m6, [r5 + 24 * 16]
17693 pmaddubsw m3, m0, m6
17694 pmulhrsw m3, m7
17695 pmaddubsw m5, m2, m6
17696 pmulhrsw m5, m7
17697 packuswb m3, m5
17698 movu [r0 + 430 * 16], m3
17699 pmaddubsw m3, m1, m6
17700 pmulhrsw m3, m7
17701 pmaddubsw m5, m4, m6
17702 pmulhrsw m5, m7
17703 packuswb m3, m5
17704 movu [r0 + 431 * 16], m3
17705
17706 ; mode 8 [row 24]
17707 movu m6, [r5 + 29 * 16]
17708 pmaddubsw m3, m0, m6
17709 pmulhrsw m3, m7
17710 pmaddubsw m5, m2, m6
17711 pmulhrsw m5, m7
17712 packuswb m3, m5
17713 movu [r0 + 432 * 16], m3
17714 pmaddubsw m3, m1, m6
17715 pmulhrsw m3, m7
17716 pmaddubsw m5, m4, m6
17717 pmulhrsw m5, m7
17718 packuswb m3, m5
17719 movu [r0 + 433 * 16], m3
17720
17721 ; mode 3 [row 4]
17722 movu m6, [r5 + 2 * 16]
17723 movu m0, [r4 + 5]
17724 movd m1, [r4 + 6]
17725 palignr m1, m0, 1
17726 punpcklbw m0, m1
17727 pmaddubsw m1, m0, m6
17728 pmulhrsw m1, m7
17729 movu m2, [r4 + 13]
17730 movd m3, [r4 + 14]
17731 palignr m3, m2, 1
17732 punpcklbw m2, m3
17733 pmaddubsw m3, m2, m6
17734 pmulhrsw m3, m7
17735 packuswb m1, m3
17736 movu [r0 + 72 * 16], m1
17737
17738 ; mode 6 [row 9 - first half]
17739 movu [r0 + 274 * 16], m1
17740
17741 ; mode 8 [row 25 - first half]
17742 movu [r0 + 434 * 16], m1
17743
17744 movu m1, [r4 + 21]
17745 movd m3, [r4 + 22]
17746 palignr m3, m1, 1
17747 punpcklbw m1, m3
17748 pmaddubsw m3, m1, m6
17749 pmulhrsw m3, m7
17750 movu m4, [r4 + 29]
17751 movd m5, [r4 + 30]
17752 palignr m5, m4, 1
17753 punpcklbw m4, m5
17754 pmaddubsw m5, m4, m6
17755 pmulhrsw m5, m7
17756 packuswb m3, m5
17757 movu [r0 + 73 * 16], m3
17758
17759 ; mode 6 [row 9 - second half]
17760 movu [r0 + 275 * 16], m3
17761
17762 ; mode 8 [row 25 - second half]
17763 movu [r0 + 435 * 16], m3
17764
17765 ; mode 3 [row 5]
17766 movu m6, [r5 + 28 * 16]
17767 pmaddubsw m3, m0, m6
17768 pmulhrsw m3, m7
17769 pmaddubsw m5, m2, m6
17770 pmulhrsw m5, m7
17771 packuswb m3, m5
17772 movu [r0 + 74 * 16], m3
17773
17774 ; mode 6 [row 11 - first half]
17775 movu [r0 + 278 * 16], m3
17776
17777 pmaddubsw m3, m1, m6
17778 pmulhrsw m3, m7
17779 pmaddubsw m5, m4, m6
17780 pmulhrsw m5, m7
17781 packuswb m3, m5
17782 movu [r0 + 75 * 16], m3
17783
17784 ; mode 6 [row 11 - second half]
17785 movu [r0 + 279 * 16], m3
17786
17787 ; mode 4 [row 6]
17788 movu m6, [r5 + 19 * 16]
17789 pmaddubsw m3, m0, m6
17790 pmulhrsw m3, m7
17791 pmaddubsw m5, m2, m6
17792 pmulhrsw m5, m7
17793 packuswb m3, m5
17794 movu [r0 + 140 * 16], m3
17795 pmaddubsw m3, m1, m6
17796 pmulhrsw m3, m7
17797 pmaddubsw m5, m4, m6
17798 pmulhrsw m5, m7
17799 packuswb m3, m5
17800 movu [r0 + 141 * 16], m3
17801
17802 ; mode 5 [row 7]
17803 movu m6, [r5 + 8 * 16]
17804 pmaddubsw m3, m0, m6
17805 pmulhrsw m3, m7
17806 pmaddubsw m5, m2, m6
17807 pmulhrsw m5, m7
17808 packuswb m3, m5
17809 movu [r0 + 206 * 16], m3
17810 pmaddubsw m3, m1, m6
17811 pmulhrsw m3, m7
17812 pmaddubsw m5, m4, m6
17813 pmulhrsw m5, m7
17814 packuswb m3, m5
17815 movu [r0 + 207 * 16], m3
17816
17817 ; mode 5 [row 8]
17818 movu m6, [r5 + 25 * 16]
17819 pmaddubsw m3, m0, m6
17820 pmulhrsw m3, m7
17821 pmaddubsw m5, m2, m6
17822 pmulhrsw m5, m7
17823 packuswb m3, m5
17824 movu [r0 + 208 * 16], m3
17825
17826 ; mode 7 [row 16 - first half]
17827 movu [r0 + 352 * 16], m3
17828
17829 pmaddubsw m3, m1, m6
17830 pmulhrsw m3, m7
17831 pmaddubsw m5, m4, m6
17832 pmulhrsw m5, m7
17833 packuswb m3, m5
17834 movu [r0 + 209 * 16], m3
17835
17836 ; mode 7 [row 16 - second half]
17837 movu [r0 + 353 * 16], m3
17838
17839 ; mode 6 [row 10]
17840 movu m6, [r5 + 15 * 16]
17841 pmaddubsw m3, m0, m6
17842 pmulhrsw m3, m7
17843 pmaddubsw m5, m2, m6
17844 pmulhrsw m5, m7
17845 packuswb m3, m5
17846 movu [r0 + 276 * 16], m3
17847 pmaddubsw m3, m1, m6
17848 pmulhrsw m3, m7
17849 pmaddubsw m5, m4, m6
17850 pmulhrsw m5, m7
17851 packuswb m3, m5
17852 movu [r0 + 277 * 16], m3
17853
17854 ; mode 7 [row 14]
17855 movu m6, [r5 + 7 * 16]
17856 pmaddubsw m3, m0, m6
17857 pmulhrsw m3, m7
17858 pmaddubsw m5, m2, m6
17859 pmulhrsw m5, m7
17860 packuswb m3, m5
17861 movu [r0 + 348 * 16], m3
17862
17863 ; mode 8 [row 26 - first half]
17864 movu [r0 + 436 * 16], m3
17865
17866 pmaddubsw m3, m1, m6
17867 pmulhrsw m3, m7
17868 pmaddubsw m5, m4, m6
17869 pmulhrsw m5, m7
17870 packuswb m3, m5
17871 movu [r0 + 349 * 16], m3
17872
17873 ; mode 8 [row 26 - second half]
17874 movu [r0 + 437 * 16], m3
17875
17876 ; mode 7 [row 15]
17877 movu m6, [r5 + 16 * 16]
17878 pmaddubsw m3, m0, m6
17879 pmulhrsw m3, m7
17880 pmaddubsw m5, m2, m6
17881 pmulhrsw m5, m7
17882 packuswb m3, m5
17883 movu [r0 + 350 * 16], m3
17884 pmaddubsw m3, m1, m6
17885 pmulhrsw m3, m7
17886 pmaddubsw m5, m4, m6
17887 pmulhrsw m5, m7
17888 packuswb m3, m5
17889 movu [r0 + 351 * 16], m3
17890
17891 ; mode 8 [row 27]
17892 movu m6, [r5 + 12 * 16]
17893 pmaddubsw m3, m0, m6
17894 pmulhrsw m3, m7
17895 pmaddubsw m5, m2, m6
17896 pmulhrsw m5, m7
17897 packuswb m3, m5
17898 movu [r0 + 438 * 16], m3
17899 pmaddubsw m3, m1, m6
17900 pmulhrsw m3, m7
17901 pmaddubsw m5, m4, m6
17902 pmulhrsw m5, m7
17903 packuswb m3, m5
17904 movu [r0 + 439 * 16], m3
17905
17906 ; mode 8 [row 28]
17907 movu m6, [r5 + 17 * 16]
17908 pmaddubsw m3, m0, m6
17909 pmulhrsw m3, m7
17910 pmaddubsw m5, m2, m6
17911 pmulhrsw m5, m7
17912 packuswb m3, m5
17913 movu [r0 + 440 * 16], m3
17914 pmaddubsw m3, m1, m6
17915 pmulhrsw m3, m7
17916 pmaddubsw m5, m4, m6
17917 pmulhrsw m5, m7
17918 packuswb m3, m5
17919 movu [r0 + 441 * 16], m3
17920
17921 ; mode 8 [row 29]
17922 movu m6, [r5 + 22 * 16]
17923 pmaddubsw m3, m0, m6
17924 pmulhrsw m3, m7
17925 pmaddubsw m5, m2, m6
17926 pmulhrsw m5, m7
17927 packuswb m3, m5
17928 movu [r0 + 442 * 16], m3
17929 pmaddubsw m3, m1, m6
17930 pmulhrsw m3, m7
17931 pmaddubsw m5, m4, m6
17932 pmulhrsw m5, m7
17933 packuswb m3, m5
17934 movu [r0 + 443 * 16], m3
17935
17936 ; mode 8 [row 30]
17937 movu m6, [r5 + 27 * 16]
17938 pmaddubsw m3, m0, m6
17939 pmulhrsw m3, m7
17940 pmaddubsw m5, m2, m6
17941 pmulhrsw m5, m7
17942 packuswb m3, m5
17943 movu [r0 + 444 * 16], m3
17944 pmaddubsw m3, m1, m6
17945 pmulhrsw m3, m7
17946 pmaddubsw m5, m4, m6
17947 pmulhrsw m5, m7
17948 packuswb m3, m5
17949 movu [r0 + 445 * 16], m3
17950
17951 ; mode 3 [row 6]
17952 movu m6, [r5 + 22 * 16]
17953 movu m0, [r4 + 6]
17954 movd m1, [r4 + 7]
17955 palignr m1, m0, 1
17956 punpcklbw m0, m1
17957 pmaddubsw m1, m0, m6
17958 pmulhrsw m1, m7
17959 movu m2, [r4 + 14]
17960 movd m3, [r4 + 15]
17961 palignr m3, m2, 1
17962 punpcklbw m2, m3
17963 pmaddubsw m3, m2, m6
17964 pmulhrsw m3, m7
17965 packuswb m1, m3
17966 movu [r0 + 76 * 16], m1
17967
17968 ; mode 6 [row 13 - first half]
17969 movu [r0 + 282 * 16], m1
17970
17971 movu m1, [r4 + 22]
17972 movd m3, [r4 + 23]
17973 palignr m3, m1, 1
17974 punpcklbw m1, m3
17975 pmaddubsw m3, m1, m6
17976 pmulhrsw m3, m7
17977 movu m4, [r4 + 30]
17978 movd m5, [r4 + 31]
17979 palignr m5, m4, 1
17980 punpcklbw m4, m5
17981 pmaddubsw m5, m4, m6
17982 pmulhrsw m5, m7
17983 packuswb m3, m5
17984 movu [r0 + 77 * 16], m3
17985
17986 ; mode 6 [row 13 - second half]
17987 movu [r0 + 283 * 16], m3
17988
17989 ; mode 4 [row 7]
17990 movu m6, [r5 + 8 * 16]
17991 pmaddubsw m3, m0, m6
17992 pmulhrsw m3, m7
17993 pmaddubsw m5, m2, m6
17994 pmulhrsw m5, m7
17995 packuswb m3, m5
17996 movu [r0 + 142 * 16], m3
17997 pmaddubsw m3, m1, m6
17998 pmulhrsw m3, m7
17999 pmaddubsw m5, m4, m6
18000 pmulhrsw m5, m7
18001 packuswb m3, m5
18002 movu [r0 + 143 * 16], m3
18003
18004 ; mode 4 [row 8]
18005 movu m6, [r5 + 29 * 16]
18006 pmaddubsw m3, m0, m6
18007 pmulhrsw m3, m7
18008 pmaddubsw m5, m2, m6
18009 pmulhrsw m5, m7
18010 packuswb m3, m5
18011 movu [r0 + 144 * 16], m3
18012
18013 ; mode 7 [row 20 - first half]
18014 movu [r0 + 360 * 16], m3
18015
18016 pmaddubsw m3, m1, m6
18017 pmulhrsw m3, m7
18018 pmaddubsw m5, m4, m6
18019 pmulhrsw m5, m7
18020 packuswb m3, m5
18021 movu [r0 + 145 * 16], m3
18022
18023 ; mode 7 [row 20 - second half]
18024 movu [r0 + 361 * 16], m3
18025
18026 ; mode 5 [row 9]
18027 movu m6, [r5 + 10 * 16]
18028 pmaddubsw m3, m0, m6
18029 pmulhrsw m3, m7
18030 pmaddubsw m5, m2, m6
18031 pmulhrsw m5, m7
18032 packuswb m3, m5
18033 movu [r0 + 210 * 16], m3
18034 pmaddubsw m3, m1, m6
18035 pmulhrsw m3, m7
18036 pmaddubsw m5, m4, m6
18037 pmulhrsw m5, m7
18038 packuswb m3, m5
18039 movu [r0 + 211 * 16], m3
18040
18041 ; mode 5 [row 10]
18042 movu m6, [r5 + 27 * 16]
18043 pmaddubsw m3, m0, m6
18044 pmulhrsw m3, m7
18045 pmaddubsw m5, m2, m6
18046 pmulhrsw m5, m7
18047 packuswb m3, m5
18048 movu [r0 + 212 * 16], m3
18049 pmaddubsw m3, m1, m6
18050 pmulhrsw m3, m7
18051 pmaddubsw m5, m4, m6
18052 pmulhrsw m5, m7
18053 packuswb m3, m5
18054 movu [r0 + 213 * 16], m3
18055
18056 ; mode 7 [row 17]
18057 movu m6, [r5 + 2 * 16]
18058 pmaddubsw m3, m0, m6
18059 pmulhrsw m3, m7
18060 pmaddubsw m5, m2, m6
18061 pmulhrsw m5, m7
18062 packuswb m3, m5
18063 movu [r0 + 354 * 16], m3
18064 pmaddubsw m3, m1, m6
18065 pmulhrsw m3, m7
18066 pmaddubsw m5, m4, m6
18067 pmulhrsw m5, m7
18068 packuswb m3, m5
18069 movu [r0 + 355 * 16], m3
18070
18071 ; mode 7 [row 18]
18072 movu m6, [r5 + 11 * 16]
18073 pmaddubsw m3, m0, m6
18074 pmulhrsw m3, m7
18075 pmaddubsw m5, m2, m6
18076 pmulhrsw m5, m7
18077 packuswb m3, m5
18078 movu [r0 + 356 * 16], m3
18079 pmaddubsw m3, m1, m6
18080 pmulhrsw m3, m7
18081 pmaddubsw m5, m4, m6
18082 pmulhrsw m5, m7
18083 packuswb m3, m5
18084 movu [r0 + 357 * 16], m3
18085
18086 ; mode 7 [row 19]
18087 movu m6, [r5 + 20 * 16]
18088 pmaddubsw m3, m0, m6
18089 pmulhrsw m3, m7
18090 pmaddubsw m5, m2, m6
18091 pmulhrsw m5, m7
18092 packuswb m3, m5
18093 movu [r0 + 358 * 16], m3
18094 pmaddubsw m3, m1, m6
18095 pmulhrsw m3, m7
18096 pmaddubsw m5, m4, m6
18097 pmulhrsw m5, m7
18098 packuswb m3, m5
18099 movu [r0 + 359 * 16], m3
18100
18101 ; mode 6 [row 12]
18102 movu m6, [r5 + 9 * 16]
18103 pmaddubsw m3, m0, m6
18104 pmulhrsw m3, m7
18105 pmaddubsw m5, m2, m6
18106 pmulhrsw m5, m7
18107 packuswb m3, m5
18108 movu [r0 + 280 * 16], m3
18109 pmaddubsw m3, m1, m6
18110 pmulhrsw m3, m7
18111 pmaddubsw m5, m4, m6
18112 pmulhrsw m5, m7
18113 packuswb m3, m5
18114 movu [r0 + 281 * 16], m3
18115
18116 ; mode 3 [row 7]
18117 movu m6, [r5 + 16 * 16]
18118 movu m0, [r4 + 7]
18119 movd m1, [r4 + 8]
18120 palignr m1, m0, 1
18121 punpcklbw m0, m1
18122 pmaddubsw m1, m0, m6
18123 pmulhrsw m1, m7
18124 movu m2, [r4 + 15]
18125 movd m3, [r4 + 16]
18126 palignr m3, m2, 1
18127 punpcklbw m2, m3
18128 pmaddubsw m3, m2, m6
18129 pmulhrsw m3, m7
18130 packuswb m1, m3
18131 movu [r0 + 78 * 16], m1
18132
18133 ; mode 6 [row 15 - first half]
18134 movu [r0 + 286 * 16], m1
18135
18136 movu m1, [r4 + 23]
18137 movd m3, [r4 + 24]
18138 palignr m3, m1, 1
18139 punpcklbw m1, m3
18140 pmaddubsw m3, m1, m6
18141 pmulhrsw m3, m7
18142 movu m4, [r4 + 31]
18143 movd m5, [r4 + 32]
18144 palignr m5, m4, 1
18145 punpcklbw m4, m5
18146 pmaddubsw m5, m4, m6
18147 pmulhrsw m5, m7
18148 packuswb m3, m5
18149 movu [r0 + 79 * 16], m3
18150
18151 ; mode 6 [row 15 - second half]
18152 movu [r0 + 287 * 16], m3
18153
18154 ; mode 4 [row 9]
18155 movu m6, [r5 + 18 * 16]
18156 pmaddubsw m3, m0, m6
18157 pmulhrsw m3, m7
18158 pmaddubsw m5, m2, m6
18159 pmulhrsw m5, m7
18160 packuswb m3, m5
18161 movu [r0 + 146 * 16], m3
18162 pmaddubsw m3, m1, m6
18163 pmulhrsw m3, m7
18164 pmaddubsw m5, m4, m6
18165 pmulhrsw m5, m7
18166 packuswb m3, m5
18167 movu [r0 + 147 * 16], m3
18168
18169 ; mode 5 [row 11]
18170 movu m6, [r5 + 12 * 16]
18171 pmaddubsw m3, m0, m6
18172 pmulhrsw m3, m7
18173 pmaddubsw m5, m2, m6
18174 pmulhrsw m5, m7
18175 packuswb m3, m5
18176 movu [r0 + 214 * 16], m3
18177 pmaddubsw m3, m1, m6
18178 pmulhrsw m3, m7
18179 pmaddubsw m5, m4, m6
18180 pmulhrsw m5, m7
18181 packuswb m3, m5
18182 movu [r0 + 215 * 16], m3
18183
18184 ; mode 5 [row 12]
18185 movu m6, [r5 + 29 * 16]
18186 pmaddubsw m3, m0, m6
18187 pmulhrsw m3, m7
18188 pmaddubsw m5, m2, m6
18189 pmulhrsw m5, m7
18190 packuswb m3, m5
18191 movu [r0 + 216 * 16], m3
18192
18193 ; mode 6 [row 16 - first half]
18194 movu [r0 + 288 * 16], m3
18195
18196 pmaddubsw m3, m1, m6
18197 pmulhrsw m3, m7
18198 pmaddubsw m5, m4, m6
18199 pmulhrsw m5, m7
18200 packuswb m3, m5
18201 movu [r0 + 217 * 16], m3
18202
18203 ; mode 6 [row 16 - second half]
18204 movu [r0 + 289 * 16], m3
18205
18206 ; mode 6 [row 14]
18207 movu m6, [r5 + 3 * 16]
18208 pmaddubsw m3, m0, m6
18209 pmulhrsw m3, m7
18210 pmaddubsw m5, m2, m6
18211 pmulhrsw m5, m7
18212 packuswb m3, m5
18213 movu [r0 + 284 * 16], m3
18214 pmaddubsw m3, m1, m6
18215 pmulhrsw m3, m7
18216 pmaddubsw m5, m4, m6
18217 pmulhrsw m5, m7
18218 packuswb m3, m5
18219 movu [r0 + 285 * 16], m3
18220
18221 ; mode 7 [row 21]
18222 movu m6, [r5 + 6 * 16]
18223 pmaddubsw m3, m0, m6
18224 pmulhrsw m3, m7
18225 pmaddubsw m5, m2, m6
18226 pmulhrsw m5, m7
18227 packuswb m3, m5
18228 movu [r0 + 362 * 16], m3
18229 pmaddubsw m3, m1, m6
18230 pmulhrsw m3, m7
18231 pmaddubsw m5, m4, m6
18232 pmulhrsw m5, m7
18233 packuswb m3, m5
18234 movu [r0 + 363 * 16], m3
18235
18236 ; mode 7 [row 22]
18237 movu m6, [r5 + 15 * 16]
18238 pmaddubsw m3, m0, m6
18239 pmulhrsw m3, m7
18240 pmaddubsw m5, m2, m6
18241 pmulhrsw m5, m7
18242 packuswb m3, m5
18243 movu [r0 + 364 * 16], m3
18244 pmaddubsw m3, m1, m6
18245 pmulhrsw m3, m7
18246 pmaddubsw m5, m4, m6
18247 pmulhrsw m5, m7
18248 packuswb m3, m5
18249 movu [r0 + 365 * 16], m3
18250
18251 ; mode 7 [row 23]
18252 movu m6, [r5 + 24 * 16]
18253 pmaddubsw m3, m0, m6
18254 pmulhrsw m3, m7
18255 pmaddubsw m5, m2, m6
18256 pmulhrsw m5, m7
18257 packuswb m3, m5
18258 movu [r0 + 366 * 16], m3
18259 pmaddubsw m3, m1, m6
18260 pmulhrsw m3, m7
18261 pmaddubsw m5, m4, m6
18262 pmulhrsw m5, m7
18263 packuswb m3, m5
18264 movu [r0 + 367 * 16], m3
18265
18266 ; mode 3 [row 8]
18267 movu m6, [r5 + 10 * 16]
18268 movu m0, [r4 + 8]
18269 movd m1, [r4 + 9]
18270 palignr m1, m0, 1
18271 punpcklbw m0, m1
18272 pmaddubsw m1, m0, m6
18273 pmulhrsw m1, m7
18274 movu m2, [r4 + 16]
18275 movd m3, [r4 + 17]
18276 palignr m3, m2, 1
18277 punpcklbw m2, m3
18278 pmaddubsw m3, m2, m6
18279 pmulhrsw m3, m7
18280 packuswb m1, m3
18281 movu [r0 + 80 * 16], m1
18282
18283 ; mode 6 [row 17 - first half]
18284 movu [r0 + 290 * 16], m1
18285
18286 ; mode 7 [row 25 - first half]
18287 movu [r0 + 370 * 16], m1
18288
18289 movu m1, [r4 + 24]
18290 movd m3, [r4 + 25]
18291 palignr m3, m1, 1
18292 punpcklbw m1, m3
18293 pmaddubsw m3, m1, m6
18294 pmulhrsw m3, m7
18295 movu m4, [r4 + 32]
18296 movd m5, [r4 + 33]
18297 palignr m5, m4, 1
18298 punpcklbw m4, m5
18299 pmaddubsw m5, m4, m6
18300 pmulhrsw m5, m7
18301 packuswb m3, m5
18302 movu [r0 + 81 * 16], m3
18303
18304 ; mode 6 [row 17 - second half]
18305 movu [r0 + 291 * 16], m3
18306
18307 ; mode 7 [row 25 - second half]
18308 movu [r0 + 371 * 16], m3
18309
18310 ; mode 4 [row 10]
18311 movu m6, [r5 + 7 * 16]
18312 pmaddubsw m3, m0, m6
18313 pmulhrsw m3, m7
18314 pmaddubsw m5, m2, m6
18315 pmulhrsw m5, m7
18316 packuswb m3, m5
18317 movu [r0 + 148 * 16], m3
18318 pmaddubsw m3, m1, m6
18319 pmulhrsw m3, m7
18320 pmaddubsw m5, m4, m6
18321 pmulhrsw m5, m7
18322 packuswb m3, m5
18323 movu [r0 + 149 * 16], m3
18324
18325 ; mode 4 [row 11]
18326 movu m6, [r5 + 28 * 16]
18327 pmaddubsw m3, m0, m6
18328 pmulhrsw m3, m7
18329 pmaddubsw m5, m2, m6
18330 pmulhrsw m5, m7
18331 packuswb m3, m5
18332 movu [r0 + 150 * 16], m3
18333
18334 ; mode 7 [row 27 - first half]
18335 movu [r0 + 374 * 16], m3
18336
18337 pmaddubsw m3, m1, m6
18338 pmulhrsw m3, m7
18339 pmaddubsw m5, m4, m6
18340 pmulhrsw m5, m7
18341 packuswb m3, m5
18342 movu [r0 + 151 * 16], m3
18343
18344 ; mode 7 [row 27 - second half]
18345 movu [r0 + 375 * 16], m3
18346
18347 ; mode 5 [row 13]
18348 movu m6, [r5 + 14 * 16]
18349 pmaddubsw m3, m0, m6
18350 pmulhrsw m3, m7
18351 pmaddubsw m5, m2, m6
18352 pmulhrsw m5, m7
18353 packuswb m3, m5
18354 movu [r0 + 218 * 16], m3
18355 pmaddubsw m3, m1, m6
18356 pmulhrsw m3, m7
18357 pmaddubsw m5, m4, m6
18358 pmulhrsw m5, m7
18359 packuswb m3, m5
18360 movu [r0 + 219 * 16], m3
18361
18362 ; mode 5 [row 14]
18363 movu m6, [r5 + 31 * 16]
18364 pmaddubsw m3, m0, m6
18365 pmulhrsw m3, m7
18366 pmaddubsw m5, m2, m6
18367 pmulhrsw m5, m7
18368 packuswb m3, m5
18369 movu [r0 + 220 * 16], m3
18370 pmaddubsw m3, m1, m6
18371 pmulhrsw m3, m7
18372 pmaddubsw m5, m4, m6
18373 pmulhrsw m5, m7
18374 packuswb m3, m5
18375 movu [r0 + 221 * 16], m3
18376
18377 ; mode 6 [row 18]
18378 movu m6, [r5 + 23 * 16]
18379 pmaddubsw m3, m0, m6
18380 pmulhrsw m3, m7
18381 pmaddubsw m5, m2, m6
18382 pmulhrsw m5, m7
18383 packuswb m3, m5
18384 movu [r0 + 292 * 16], m3
18385 pmaddubsw m3, m1, m6
18386 pmulhrsw m3, m7
18387 pmaddubsw m5, m4, m6
18388 pmulhrsw m5, m7
18389 packuswb m3, m5
18390 movu [r0 + 293 * 16], m3
18391
18392 ; mode 7 [row 24]
18393 movu m6, [r5 + 1 * 16]
18394 pmaddubsw m3, m0, m6
18395 pmulhrsw m3, m7
18396 pmaddubsw m5, m2, m6
18397 pmulhrsw m5, m7
18398 packuswb m3, m5
18399 movu [r0 + 368 * 16], m3
18400 pmaddubsw m3, m1, m6
18401 pmulhrsw m3, m7
18402 pmaddubsw m5, m4, m6
18403 pmulhrsw m5, m7
18404 packuswb m3, m5
18405 movu [r0 + 369 * 16], m3
18406
18407 ; mode 7 [row 26]
18408 movu m6, [r5 + 19 * 16]
18409 pmaddubsw m3, m0, m6
18410 pmulhrsw m3, m7
18411 pmaddubsw m5, m2, m6
18412 pmulhrsw m5, m7
18413 packuswb m3, m5
18414 movu [r0 + 372 * 16], m3
18415 pmaddubsw m3, m1, m6
18416 pmulhrsw m3, m7
18417 pmaddubsw m5, m4, m6
18418 pmulhrsw m5, m7
18419 packuswb m3, m5
18420 movu [r0 + 373 * 16], m3
18421
18422 ; mode 3 [row 9]
18423 movu m6, [r5 + 4 * 16]
18424 movu m0, [r4 + 9]
18425 movd m1, [r4 + 10]
18426 palignr m1, m0, 1
18427 punpcklbw m0, m1
18428 pmaddubsw m1, m0, m6
18429 pmulhrsw m1, m7
18430 movu m2, [r4 + 17]
18431 movd m3, [r4 + 18]
18432 palignr m3, m2, 1
18433 punpcklbw m2, m3
18434 pmaddubsw m3, m2, m6
18435 pmulhrsw m3, m7
18436 packuswb m1, m3
18437 movu [r0 + 82 * 16], m1
18438
18439 ; mode 6 [row 19 - first half]
18440 movu [r0 + 294 * 16], m1
18441
18442 movu m1, [r4 + 25]
18443 movd m3, [r4 + 26]
18444 palignr m3, m1, 1
18445 punpcklbw m1, m3
18446 pmaddubsw m3, m1, m6
18447 pmulhrsw m3, m7
18448 movu m4, [r4 + 33]
18449 movd m5, [r4 + 34]
18450 palignr m5, m4, 1
18451 punpcklbw m4, m5
18452 pmaddubsw m5, m4, m6
18453 pmulhrsw m5, m7
18454 packuswb m3, m5
18455 movu [r0 + 83 * 16], m3
18456
18457 ; mode 6 [row 19 - second half]
18458 movu [r0 + 295 * 16], m3
18459
18460 ; mode 4 [row 12]
18461 movu m6, [r5 + 17 * 16]
18462 pmaddubsw m3, m0, m6
18463 pmulhrsw m3, m7
18464 pmaddubsw m5, m2, m6
18465 pmulhrsw m5, m7
18466 packuswb m3, m5
18467 movu [r0 + 152 * 16], m3
18468
18469 ; mode 6 [row 20 - first half]
18470 movu [r0 + 296 * 16], m3
18471
18472 pmaddubsw m3, m1, m6
18473 pmulhrsw m3, m7
18474 pmaddubsw m5, m4, m6
18475 pmulhrsw m5, m7
18476 packuswb m3, m5
18477 movu [r0 + 153 * 16], m3
18478
18479 ; mode 6 [row 20 - second half]
18480 movu [r0 + 297 * 16], m3
18481
18482 ; mode 3 [row 10]
18483 movu m6, [r5 + 30 * 16]
18484 pmaddubsw m3, m0, m6
18485 pmulhrsw m3, m7
18486 pmaddubsw m5, m2, m6
18487 pmulhrsw m5, m7
18488 packuswb m3, m5
18489 movu [r0 + 84 * 16], m3
18490
18491 ; mode 6 [row 21 - first half]
18492 movu [r0 + 298 * 16], m3
18493
18494 pmaddubsw m3, m1, m6
18495 pmulhrsw m3, m7
18496 pmaddubsw m5, m4, m6
18497 pmulhrsw m5, m7
18498 packuswb m3, m5
18499 movu [r0 + 85 * 16], m3
18500
18501 ; mode 6 [row 21 - second half]
18502 movu [r0 + 299 * 16], m3
18503
18504 ; mode 5 [row 15]
18505 movu m6, [r5 + 16 * 16]
18506 pmaddubsw m3, m0, m6
18507 pmulhrsw m3, m7
18508 pmaddubsw m5, m2, m6
18509 pmulhrsw m5, m7
18510 packuswb m3, m5
18511 movu [r0 + 222 * 16], m3
18512 pmaddubsw m3, m1, m6
18513 pmulhrsw m3, m7
18514 pmaddubsw m5, m4, m6
18515 pmulhrsw m5, m7
18516 packuswb m3, m5
18517 movu [r0 + 223 * 16], m3
18518
18519 ; mode 7 [row 28]
18520 movu m6, [r5 + 5 * 16]
18521 pmaddubsw m3, m0, m6
18522 pmulhrsw m3, m7
18523 pmaddubsw m5, m2, m6
18524 pmulhrsw m5, m7
18525 packuswb m3, m5
18526 movu [r0 + 376 * 16], m3
18527 pmaddubsw m3, m1, m6
18528 pmulhrsw m3, m7
18529 pmaddubsw m5, m4, m6
18530 pmulhrsw m5, m7
18531 packuswb m3, m5
18532 movu [r0 + 377 * 16], m3
18533
18534 ; mode 7 [row 29]
18535 movu m6, [r5 + 14 * 16]
18536 pmaddubsw m3, m0, m6
18537 pmulhrsw m3, m7
18538 pmaddubsw m5, m2, m6
18539 pmulhrsw m5, m7
18540 packuswb m3, m5
18541 movu [r0 + 378 * 16], m3
18542 pmaddubsw m3, m1, m6
18543 pmulhrsw m3, m7
18544 pmaddubsw m5, m4, m6
18545 pmulhrsw m5, m7
18546 packuswb m3, m5
18547 movu [r0 + 379 * 16], m3
18548
18549 ; mode 7 [row 30]
18550 movu m6, [r5 + 23 * 16]
18551 pmaddubsw m3, m0, m6
18552 pmulhrsw m3, m7
18553 pmaddubsw m5, m2, m6
18554 pmulhrsw m5, m7
18555 packuswb m3, m5
18556 movu [r0 + 380 * 16], m3
18557 pmaddubsw m3, m1, m6
18558 pmulhrsw m3, m7
18559 pmaddubsw m5, m4, m6
18560 pmulhrsw m5, m7
18561 packuswb m3, m5
18562 movu [r0 + 381 * 16], m3
18563
18564 ; mode 3 [row 11]
18565 movu m6, [r5 + 24 * 16]
18566 movu m0, [r4 + 10]
18567 movd m1, [r4 + 11]
18568 palignr m1, m0, 1
18569 punpcklbw m0, m1
18570 pmaddubsw m1, m0, m6
18571 pmulhrsw m1, m7
18572 movu m2, [r4 + 18]
18573 movd m3, [r4 + 19]
18574 palignr m3, m2, 1
18575 punpcklbw m2, m3
18576 pmaddubsw m3, m2, m6
18577 pmulhrsw m3, m7
18578 packuswb m1, m3
18579 movu [r0 + 86 * 16], m1
18580
18581 ; mode 6 [row 23 - first half]
18582 movu [r0 + 302 * 16], m1
18583
18584 movu m1, [r4 + 26]
18585 movd m3, [r4 + 27]
18586 palignr m3, m1, 1
18587 punpcklbw m1, m3
18588 pmaddubsw m3, m1, m6
18589 pmulhrsw m3, m7
18590 movu m4, [r4 + 34]
18591 movd m5, [r4 + 35]
18592 palignr m5, m4, 1
18593 punpcklbw m4, m5
18594 pmaddubsw m5, m4, m6
18595 pmulhrsw m5, m7
18596 packuswb m3, m5
18597 movu [r0 + 87 * 16], m3
18598
18599 ; mode 6 [row 23 - second half]
18600 movu [r0 + 303 * 16], m3
18601
18602 ; mode 4 [row 13]
18603 movu m6, [r5 + 6 * 16]
18604 pmaddubsw m3, m0, m6
18605 pmulhrsw m3, m7
18606 pmaddubsw m5, m2, m6
18607 pmulhrsw m5, m7
18608 packuswb m3, m5
18609 movu [r0 + 154 * 16], m3
18610 pmaddubsw m3, m1, m6
18611 pmulhrsw m3, m7
18612 pmaddubsw m5, m4, m6
18613 pmulhrsw m5, m7
18614 packuswb m3, m5
18615 movu [r0 + 155 * 16], m3
18616
18617 ; mode 4 [row 14]
18618 movu m6, [r5 + 27 * 16]
18619 pmaddubsw m3, m0, m6
18620 pmulhrsw m3, m7
18621 pmaddubsw m5, m2, m6
18622 pmulhrsw m5, m7
18623 packuswb m3, m5
18624 movu [r0 + 156 * 16], m3
18625 pmaddubsw m3, m1, m6
18626 pmulhrsw m3, m7
18627 pmaddubsw m5, m4, m6
18628 pmulhrsw m5, m7
18629 packuswb m3, m5
18630 movu [r0 + 157 * 16], m3
18631
18632 ; mode 5 [row 16]
18633 movu m6, [r5 + 1 * 16]
18634 pmaddubsw m3, m0, m6
18635 pmulhrsw m3, m7
18636 pmaddubsw m5, m2, m6
18637 pmulhrsw m5, m7
18638 packuswb m3, m5
18639 movu [r0 + 224 * 16], m3
18640 pmaddubsw m3, m1, m6
18641 pmulhrsw m3, m7
18642 pmaddubsw m5, m4, m6
18643 pmulhrsw m5, m7
18644 packuswb m3, m5
18645 movu [r0 + 225 * 16], m3
18646
18647 ; mode 5 [row 17]
18648 movu m6, [r5 + 18 * 16]
18649 pmaddubsw m3, m0, m6
18650 pmulhrsw m3, m7
18651 pmaddubsw m5, m2, m6
18652 pmulhrsw m5, m7
18653 packuswb m3, m5
18654 movu [r0 + 226 * 16], m3
18655 pmaddubsw m3, m1, m6
18656 pmulhrsw m3, m7
18657 pmaddubsw m5, m4, m6
18658 pmulhrsw m5, m7
18659 packuswb m3, m5
18660 movu [r0 + 227 * 16], m3
18661
18662 ; mode 6 [row 22]
18663 movu m6, [r5 + 11 * 16]
18664 pmaddubsw m3, m0, m6
18665 pmulhrsw m3, m7
18666 pmaddubsw m5, m2, m6
18667 pmulhrsw m5, m7
18668 packuswb m3, m5
18669 movu [r0 + 300 * 16], m3
18670 pmaddubsw m3, m1, m6
18671 pmulhrsw m3, m7
18672 pmaddubsw m5, m4, m6
18673 pmulhrsw m5, m7
18674 packuswb m3, m5
18675 movu [r0 + 301 * 16], m3
18676
18677 ; mode 3 [row 12]
18678 movu m6, [r5 + 18 * 16]
18679 movu m0, [r4 + 11]
18680 movd m1, [r4 + 12]
18681 palignr m1, m0, 1
18682 punpcklbw m0, m1
18683 pmaddubsw m1, m0, m6
18684 pmulhrsw m1, m7
18685 movu m2, [r4 + 19]
18686 movd m3, [r4 + 20]
18687 palignr m3, m2, 1
18688 punpcklbw m2, m3
18689 pmaddubsw m3, m2, m6
18690 pmulhrsw m3, m7
18691 packuswb m1, m3
18692 movu [r0 + 88 * 16], m1
18693
18694 ; mode 6 [row 25 - first half]
18695 movu [r0 + 306 * 16], m1
18696
18697 movu m1, [r4 + 27]
18698 movd m3, [r4 + 28]
18699 palignr m3, m1, 1
18700 punpcklbw m1, m3
18701 pmaddubsw m3, m1, m6
18702 pmulhrsw m3, m7
18703 movu m4, [r4 + 35]
18704 movd m5, [r4 + 36]
18705 palignr m5, m4, 1
18706 punpcklbw m4, m5
18707 pmaddubsw m5, m4, m6
18708 pmulhrsw m5, m7
18709 packuswb m3, m5
18710 movu [r0 + 89 * 16], m3
18711
18712 ; mode 6 [row 25 - second half]
18713 movu [r0 + 307 * 16], m3
18714
18715 ; mode 4 [row 15]
18716 movu m6, [r5 + 16 * 16]
18717 pmaddubsw m3, m0, m6
18718 pmulhrsw m3, m7
18719 pmaddubsw m5, m2, m6
18720 pmulhrsw m5, m7
18721 packuswb m3, m5
18722 movu [r0 + 158 * 16], m3
18723 pmaddubsw m3, m1, m6
18724 pmulhrsw m3, m7
18725 pmaddubsw m5, m4, m6
18726 pmulhrsw m5, m7
18727 packuswb m3, m5
18728 movu [r0 + 159 * 16], m3
18729
18730 ; mode 5 [row 18]
18731 movu m6, [r5 + 3 * 16]
18732 pmaddubsw m3, m0, m6
18733 pmulhrsw m3, m7
18734 pmaddubsw m5, m2, m6
18735 pmulhrsw m5, m7
18736 packuswb m3, m5
18737 movu [r0 + 228 * 16], m3
18738 pmaddubsw m3, m1, m6
18739 pmulhrsw m3, m7
18740 pmaddubsw m5, m4, m6
18741 pmulhrsw m5, m7
18742 packuswb m3, m5
18743 movu [r0 + 229 * 16], m3
18744
18745 ; mode 5 [row 19]
18746 movu m6, [r5 + 20 * 16]
18747 pmaddubsw m3, m0, m6
18748 pmulhrsw m3, m7
18749 pmaddubsw m5, m2, m6
18750 pmulhrsw m5, m7
18751 packuswb m3, m5
18752 movu [r0 + 230 * 16], m3
18753 pmaddubsw m3, m1, m6
18754 pmulhrsw m3, m7
18755 pmaddubsw m5, m4, m6
18756 pmulhrsw m5, m7
18757 packuswb m3, m5
18758 movu [r0 + 231 * 16], m3
18759
18760 ; mode 6 [row 24]
18761 movu m6, [r5 + 5 * 16]
18762 pmaddubsw m3, m0, m6
18763 pmulhrsw m3, m7
18764 pmaddubsw m5, m2, m6
18765 pmulhrsw m5, m7
18766 packuswb m3, m5
18767 movu [r0 + 304 * 16], m3
18768 pmaddubsw m3, m1, m6
18769 pmulhrsw m3, m7
18770 pmaddubsw m5, m4, m6
18771 pmulhrsw m5, m7
18772 packuswb m3, m5
18773 movu [r0 + 305 * 16], m3
18774
18775 ; mode 6 [row 26]
18776 movu m6, [r5 + 31 * 16]
18777 pmaddubsw m3, m0, m6
18778 pmulhrsw m3, m7
18779 pmaddubsw m5, m2, m6
18780 pmulhrsw m5, m7
18781 packuswb m3, m5
18782 movu [r0 + 308 * 16], m3
18783 pmaddubsw m3, m1, m6
18784 pmulhrsw m3, m7
18785 pmaddubsw m5, m4, m6
18786 pmulhrsw m5, m7
18787 packuswb m3, m5
18788 movu [r0 + 309 * 16], m3
18789
18790 ; mode 3 [row 13]
18791 movu m6, [r5 + 12 * 16]
18792 movu m0, [r4 + 12]
18793 movd m1, [r4 + 13]
18794 palignr m1, m0, 1
18795 punpcklbw m0, m1
18796 pmaddubsw m1, m0, m6
18797 pmulhrsw m1, m7
18798 movu m2, [r4 + 20]
18799 movd m3, [r4 + 21]
18800 palignr m3, m2, 1
18801 punpcklbw m2, m3
18802 pmaddubsw m3, m2, m6
18803 pmulhrsw m3, m7
18804 packuswb m1, m3
18805 movu [r0 + 90 * 16], m1
18806
18807 movu m1, [r4 + 28]
18808 movd m3, [r4 + 29]
18809 palignr m3, m1, 1
18810 punpcklbw m1, m3
18811 pmaddubsw m3, m1, m6
18812 pmulhrsw m3, m7
18813 movu m4, [r4 + 36]
18814 movd m5, [r4 + 37]
18815 palignr m5, m4, 1
18816 punpcklbw m4, m5
18817 pmaddubsw m5, m4, m6
18818 pmulhrsw m5, m7
18819 packuswb m3, m5
18820 movu [r0 + 91 * 16], m3
18821
18822 ; mode 4 [row 16]
18823 movu m6, [r5 + 5 * 16]
18824 pmaddubsw m3, m0, m6
18825 pmulhrsw m3, m7
18826 pmaddubsw m5, m2, m6
18827 pmulhrsw m5, m7
18828 packuswb m3, m5
18829 movu [r0 + 160 * 16], m3
18830
18831 ; mode 5 [row 20 - first half]
18832 movu [r0 + 232 * 16], m3
18833
18834 pmaddubsw m3, m1, m6
18835 pmulhrsw m3, m7
18836 pmaddubsw m5, m4, m6
18837 pmulhrsw m5, m7
18838 packuswb m3, m5
18839 movu [r0 + 161 * 16], m3
18840
18841 ; mode 5 [row 20 - second half]
18842 movu [r0 + 233 * 16], m3
18843
18844 ; mode 4 [row 17]
18845 movu m6, [r5 + 26 * 16]
18846 pmaddubsw m3, m0, m6
18847 pmulhrsw m3, m7
18848 pmaddubsw m5, m2, m6
18849 pmulhrsw m5, m7
18850 packuswb m3, m5
18851 movu [r0 + 162 * 16], m3
18852 pmaddubsw m3, m1, m6
18853 pmulhrsw m3, m7
18854 pmaddubsw m5, m4, m6
18855 pmulhrsw m5, m7
18856 packuswb m3, m5
18857 movu [r0 + 163 * 16], m3
18858
18859 ; mode 5 [row 21]
18860 movu m6, [r5 + 22 * 16]
18861 pmaddubsw m3, m0, m6
18862 pmulhrsw m3, m7
18863 pmaddubsw m5, m2, m6
18864 pmulhrsw m5, m7
18865 packuswb m3, m5
18866 movu [r0 + 234 * 16], m3
18867 pmaddubsw m3, m1, m6
18868 pmulhrsw m3, m7
18869 pmaddubsw m5, m4, m6
18870 pmulhrsw m5, m7
18871 packuswb m3, m5
18872 movu [r0 + 235 * 16], m3
18873
18874 ; mode 6 [row 27]
18875 movu m6, [r5 + 12 * 16]
18876 pmaddubsw m3, m0, m6
18877 pmulhrsw m3, m7
18878 pmaddubsw m5, m2, m6
18879 pmulhrsw m5, m7
18880 packuswb m3, m5
18881 movu [r0 + 310 * 16], m3
18882 pmaddubsw m3, m1, m6
18883 pmulhrsw m3, m7
18884 pmaddubsw m5, m4, m6
18885 pmulhrsw m5, m7
18886 packuswb m3, m5
18887 movu [r0 + 311 * 16], m3
18888
18889 ; mode 6 [row 28]
18890 movu m6, [r5 + 25 * 16]
18891 pmaddubsw m3, m0, m6
18892 pmulhrsw m3, m7
18893 pmaddubsw m5, m2, m6
18894 pmulhrsw m5, m7
18895 packuswb m3, m5
18896 movu [r0 + 312 * 16], m3
18897 pmaddubsw m3, m1, m6
18898 pmulhrsw m3, m7
18899 pmaddubsw m5, m4, m6
18900 pmulhrsw m5, m7
18901 packuswb m3, m5
18902 movu [r0 + 313 * 16], m3
18903
18904 ; mode 3 [row 14]
18905 movu m6, [r5 + 6 * 16]
18906 movu m0, [r4 + 13]
18907 movd m1, [r4 + 14]
18908 palignr m1, m0, 1
18909 punpcklbw m0, m1
18910 pmaddubsw m1, m0, m6
18911 pmulhrsw m1, m7
18912 movu m2, [r4 + 21]
18913 movd m3, [r4 + 22]
18914 palignr m3, m2, 1
18915 punpcklbw m2, m3
18916 pmaddubsw m3, m2, m6
18917 pmulhrsw m3, m7
18918 packuswb m1, m3
18919 movu [r0 + 92 * 16], m1
18920
18921 ; mode 6 [row 29 - first half]
18922 movu [r0 + 314 * 16], m1
18923
18924 movu m1, [r4 + 29]
18925 movd m3, [r4 + 30]
18926 palignr m3, m1, 1
18927 punpcklbw m1, m3
18928 pmaddubsw m3, m1, m6
18929 pmulhrsw m3, m7
18930 movu m4, [r4 + 37]
18931 movd m5, [r4 + 38]
18932 palignr m5, m4, 1
18933 punpcklbw m4, m5
18934 pmaddubsw m5, m4, m6
18935 pmulhrsw m5, m7
18936 packuswb m3, m5
18937 movu [r0 + 93 * 16], m3
18938
18939 ; mode 6 [row 29 - second half]
18940 movu [r0 + 315 * 16], m3
18941
18942 ; mode 4 [row 18]
18943 movu m6, [r5 + 15 * 16]
18944 pmaddubsw m3, m0, m6
18945 pmulhrsw m3, m7
18946 pmaddubsw m5, m2, m6
18947 pmulhrsw m5, m7
18948 packuswb m3, m5
18949 movu [r0 + 164 * 16], m3
18950 pmaddubsw m3, m1, m6
18951 pmulhrsw m3, m7
18952 pmaddubsw m5, m4, m6
18953 pmulhrsw m5, m7
18954 packuswb m3, m5
18955 movu [r0 + 165 * 16], m3
18956
18957 ; mode 5 [row 22]
18958 movu m6, [r5 + 7 * 16]
18959 pmaddubsw m3, m0, m6
18960 pmulhrsw m3, m7
18961 pmaddubsw m5, m2, m6
18962 pmulhrsw m5, m7
18963 packuswb m3, m5
18964 movu [r0 + 236 * 16], m3
18965 pmaddubsw m3, m1, m6
18966 pmulhrsw m3, m7
18967 pmaddubsw m5, m4, m6
18968 pmulhrsw m5, m7
18969 packuswb m3, m5
18970 movu [r0 + 237 * 16], m3
18971
18972 ; mode 5 [row 23]
18973 movu m6, [r5 + 24 * 16]
18974 pmaddubsw m3, m0, m6
18975 pmulhrsw m3, m7
18976 pmaddubsw m5, m2, m6
18977 pmulhrsw m5, m7
18978 packuswb m3, m5
18979 movu [r0 + 238 * 16], m3
18980 pmaddubsw m3, m1, m6
18981 pmulhrsw m3, m7
18982 pmaddubsw m5, m4, m6
18983 pmulhrsw m5, m7
18984 packuswb m3, m5
18985 movu [r0 + 239 * 16], m3
18986
18987 ; mode 6 [row 30]
18988 movu m6, [r5 + 19 * 16]
18989 pmaddubsw m3, m0, m6
18990 pmulhrsw m3, m7
18991 pmaddubsw m5, m2, m6
18992 pmulhrsw m5, m7
18993 packuswb m3, m5
18994 movu [r0 + 316 * 16], m3
18995 pmaddubsw m3, m1, m6
18996 pmulhrsw m3, m7
18997 pmaddubsw m5, m4, m6
18998 pmulhrsw m5, m7
18999 packuswb m3, m5
19000 movu [r0 + 317 * 16], m3
19001
19002 ; mode 3 [row 16]
19003 movu m6, [r5 + 26 * 16]
19004 movu m0, [r4 + 14]
19005 movd m1, [r4 + 15]
19006 palignr m1, m0, 1
19007 punpcklbw m0, m1
19008 pmaddubsw m1, m0, m6
19009 pmulhrsw m1, m7
19010 movu m2, [r4 + 22]
19011 movd m3, [r4 + 23]
19012 palignr m3, m2, 1
19013 punpcklbw m2, m3
19014 pmaddubsw m3, m2, m6
19015 pmulhrsw m3, m7
19016 packuswb m1, m3
19017 movu [r0 + 96 * 16], m1
19018
19019 ; mode 5 [row 25 - first half]
19020 movu [r0 + 242 * 16], m1
19021
19022 movu m1, [r4 + 30]
19023 movd m3, [r4 + 31]
19024 palignr m3, m1, 1
19025 punpcklbw m1, m3
19026 pmaddubsw m3, m1, m6
19027 pmulhrsw m3, m7
19028 movu m4, [r4 + 38]
19029 movd m5, [r4 + 39]
19030 palignr m5, m4, 1
19031 punpcklbw m4, m5
19032 pmaddubsw m5, m4, m6
19033 pmulhrsw m5, m7
19034 packuswb m3, m5
19035 movu [r0 + 97 * 16], m3
19036
19037 ; mode 5 [row 25 - second half]
19038 movu [r0 + 243 * 16], m3
19039
19040 ; mode 4 [row 19]
19041 movu m6, [r5 + 4 * 16]
19042 pmaddubsw m3, m0, m6
19043 pmulhrsw m3, m7
19044 pmaddubsw m5, m2, m6
19045 pmulhrsw m5, m7
19046 packuswb m3, m5
19047 movu [r0 + 166 * 16], m3
19048 pmaddubsw m3, m1, m6
19049 pmulhrsw m3, m7
19050 pmaddubsw m5, m4, m6
19051 pmulhrsw m5, m7
19052 packuswb m3, m5
19053 movu [r0 + 167 * 16], m3
19054
19055 ; mode 4 [row 20]
19056 movu m6, [r5 + 25 * 16]
19057 pmaddubsw m3, m0, m6
19058 pmulhrsw m3, m7
19059 pmaddubsw m5, m2, m6
19060 pmulhrsw m5, m7
19061 packuswb m3, m5
19062 movu [r0 + 168 * 16], m3
19063 pmaddubsw m3, m1, m6
19064 pmulhrsw m3, m7
19065 pmaddubsw m5, m4, m6
19066 pmulhrsw m5, m7
19067 packuswb m3, m5
19068 movu [r0 + 169 * 16], m3
19069
19070 ; mode 5 [row 24]
19071 movu m6, [r5 + 9 * 16]
19072 pmaddubsw m3, m0, m6
19073 pmulhrsw m3, m7
19074 pmaddubsw m5, m2, m6
19075 pmulhrsw m5, m7
19076 packuswb m3, m5
19077 movu [r0 + 240 * 16], m3
19078 pmaddubsw m3, m1, m6
19079 pmulhrsw m3, m7
19080 pmaddubsw m5, m4, m6
19081 pmulhrsw m5, m7
19082 packuswb m3, m5
19083 movu [r0 + 241 * 16], m3
19084
19085 ; mode 3 [row 17]
19086 movu m6, [r5 + 20 * 16]
19087 movu m0, [r4 + 15]
19088 movd m1, [r4 + 16]
19089 palignr m1, m0, 1
19090 punpcklbw m0, m1
19091 pmaddubsw m1, m0, m6
19092 pmulhrsw m1, m7
19093 movu m2, [r4 + 23]
19094 movd m3, [r4 + 24]
19095 palignr m3, m2, 1
19096 punpcklbw m2, m3
19097 pmaddubsw m3, m2, m6
19098 pmulhrsw m3, m7
19099 packuswb m1, m3
19100 movu [r0 + 98 * 16], m1
19101
19102 movu m1, [r4 + 31]
19103 movd m3, [r4 + 32]
19104 palignr m3, m1, 1
19105 punpcklbw m1, m3
19106 pmaddubsw m3, m1, m6
19107 pmulhrsw m3, m7
19108 movu m4, [r4 + 39]
19109 movd m5, [r4 + 40]
19110 palignr m5, m4, 1
19111 punpcklbw m4, m5
19112 pmaddubsw m5, m4, m6
19113 pmulhrsw m5, m7
19114 packuswb m3, m5
19115 movu [r0 + 99 * 16], m3
19116
19117 ; mode 4 [row 21]
19118 movu m6, [r5 + 14 * 16]
19119 pmaddubsw m3, m0, m6
19120 pmulhrsw m3, m7
19121 pmaddubsw m5, m2, m6
19122 pmulhrsw m5, m7
19123 packuswb m3, m5
19124 movu [r0 + 170 * 16], m3
19125 pmaddubsw m3, m1, m6
19126 pmulhrsw m3, m7
19127 pmaddubsw m5, m4, m6
19128 pmulhrsw m5, m7
19129 packuswb m3, m5
19130 movu [r0 + 171 * 16], m3
19131
19132 ; mode 5 [row 26]
19133 movu m6, [r5 + 11 * 16]
19134 pmaddubsw m3, m0, m6
19135 pmulhrsw m3, m7
19136 pmaddubsw m5, m2, m6
19137 pmulhrsw m5, m7
19138 packuswb m3, m5
19139 movu [r0 + 244 * 16], m3
19140 pmaddubsw m3, m1, m6
19141 pmulhrsw m3, m7
19142 pmaddubsw m5, m4, m6
19143 pmulhrsw m5, m7
19144 packuswb m3, m5
19145 movu [r0 + 245 * 16], m3
19146
19147 ; mode 5 [row 27]
19148 movu m6, [r5 + 28 * 16]
19149 pmaddubsw m3, m0, m6
19150 pmulhrsw m3, m7
19151 pmaddubsw m5, m2, m6
19152 pmulhrsw m5, m7
19153 packuswb m3, m5
19154 movu [r0 + 246 * 16], m3
19155 pmaddubsw m3, m1, m6
19156 pmulhrsw m3, m7
19157 pmaddubsw m5, m4, m6
19158 pmulhrsw m5, m7
19159 packuswb m3, m5
19160 movu [r0 + 247 * 16], m3
19161
19162 ; mode 3 [row 18]
19163 movu m6, [r5 + 14 * 16]
19164 movu m0, [r4 + 16]
19165 movd m1, [r4 + 17]
19166 palignr m1, m0, 1
19167 punpcklbw m0, m1
19168 pmaddubsw m1, m0, m6
19169 pmulhrsw m1, m7
19170 movu m2, [r4 + 24]
19171 movd m3, [r4 + 25]
19172 palignr m3, m2, 1
19173 punpcklbw m2, m3
19174 pmaddubsw m3, m2, m6
19175 pmulhrsw m3, m7
19176 packuswb m1, m3
19177 movu [r0 + 100 * 16], m1
19178
19179 movu m1, [r4 + 32]
19180 movd m3, [r4 + 33]
19181 palignr m3, m1, 1
19182 punpcklbw m1, m3
19183 pmaddubsw m3, m1, m6
19184 pmulhrsw m3, m7
19185 movu m4, [r4 + 40]
19186 movd m5, [r4 + 41]
19187 palignr m5, m4, 1
19188 punpcklbw m4, m5
19189 pmaddubsw m5, m4, m6
19190 pmulhrsw m5, m7
19191 packuswb m3, m5
19192 movu [r0 + 101 * 16], m3
19193
19194 ; mode 4 [row 22]
19195 movu m6, [r5 + 3 * 16]
19196 pmaddubsw m3, m0, m6
19197 pmulhrsw m3, m7
19198 pmaddubsw m5, m2, m6
19199 pmulhrsw m5, m7
19200 packuswb m3, m5
19201 movu [r0 + 172 * 16], m3
19202 pmaddubsw m3, m1, m6
19203 pmulhrsw m3, m7
19204 pmaddubsw m5, m4, m6
19205 pmulhrsw m5, m7
19206 packuswb m3, m5
19207 movu [r0 + 173 * 16], m3
19208
19209 ; mode 4 [row 23]
19210 movu m6, [r5 + 24 * 16]
19211 pmaddubsw m3, m0, m6
19212 pmulhrsw m3, m7
19213 pmaddubsw m5, m2, m6
19214 pmulhrsw m5, m7
19215 packuswb m3, m5
19216 movu [r0 + 174 * 16], m3
19217 pmaddubsw m3, m1, m6
19218 pmulhrsw m3, m7
19219 pmaddubsw m5, m4, m6
19220 pmulhrsw m5, m7
19221 packuswb m3, m5
19222 movu [r0 + 175 * 16], m3
19223
19224 ; mode 5 [row 28]
19225 movu m6, [r5 + 13 * 16]
19226 pmaddubsw m3, m0, m6
19227 pmulhrsw m3, m7
19228 pmaddubsw m5, m2, m6
19229 pmulhrsw m5, m7
19230 packuswb m3, m5
19231 movu [r0 + 248 * 16], m3
19232 pmaddubsw m3, m1, m6
19233 pmulhrsw m3, m7
19234 pmaddubsw m5, m4, m6
19235 pmulhrsw m5, m7
19236 packuswb m3, m5
19237 movu [r0 + 249 * 16], m3
19238
19239 ; mode 5 [row 29]
19240 movu m6, [r5 + 30 * 16]
19241 pmaddubsw m3, m0, m6
19242 pmulhrsw m3, m7
19243 pmaddubsw m5, m2, m6
19244 pmulhrsw m5, m7
19245 packuswb m3, m5
19246 movu [r0 + 250 * 16], m3
19247 pmaddubsw m3, m1, m6
19248 pmulhrsw m3, m7
19249 pmaddubsw m5, m4, m6
19250 pmulhrsw m5, m7
19251 packuswb m3, m5
19252 movu [r0 + 251 * 16], m3
19253
19254 ; mode 3 [row 19]
19255 movu m6, [r5 + 8 * 16]
19256 movu m0, [r4 + 17]
19257 movd m1, [r4 + 18]
19258 palignr m1, m0, 1
19259 punpcklbw m0, m1
19260 pmaddubsw m1, m0, m6
19261 pmulhrsw m1, m7
19262 movu m2, [r4 + 25]
19263 movd m3, [r4 + 26]
19264 palignr m3, m2, 1
19265 punpcklbw m2, m3
19266 pmaddubsw m3, m2, m6
19267 pmulhrsw m3, m7
19268 packuswb m1, m3
19269 movu [r0 + 102 * 16], m1
19270
19271 movu m1, [r4 + 33]
19272 movd m3, [r4 + 34]
19273 palignr m3, m1, 1
19274 punpcklbw m1, m3
19275 pmaddubsw m3, m1, m6
19276 pmulhrsw m3, m7
19277 movu m4, [r4 + 41]
19278 movd m5, [r4 + 42]
19279 palignr m5, m4, 1
19280 punpcklbw m4, m5
19281 pmaddubsw m5, m4, m6
19282 pmulhrsw m5, m7
19283 packuswb m3, m5
19284 movu [r0 + 103 * 16], m3
19285
19286 ; mode 4 [row 24]
19287 movu m6, [r5 + 13 * 16]
19288 pmaddubsw m3, m0, m6
19289 pmulhrsw m3, m7
19290 pmaddubsw m5, m2, m6
19291 pmulhrsw m5, m7
19292 packuswb m3, m5
19293 movu [r0 + 176 * 16], m3
19294 pmaddubsw m3, m1, m6
19295 pmulhrsw m3, m7
19296 pmaddubsw m5, m4, m6
19297 pmulhrsw m5, m7
19298 packuswb m3, m5
19299 movu [r0 + 177 * 16], m3
19300
19301 ; mode 5 [row 30]
19302 movu m6, [r5 + 15 * 16]
19303 pmaddubsw m3, m0, m6
19304 pmulhrsw m3, m7
19305 pmaddubsw m5, m2, m6
19306 pmulhrsw m5, m7
19307 packuswb m3, m5
19308 movu [r0 + 252 * 16], m3
19309 pmaddubsw m3, m1, m6
19310 pmulhrsw m3, m7
19311 pmaddubsw m5, m4, m6
19312 pmulhrsw m5, m7
19313 packuswb m3, m5
19314 movu [r0 + 253 * 16], m3
19315
19316 ; mode 3 [row 20]
19317 movu m6, [r5 + 2 * 16]
19318 movu m0, [r4 + 18]
19319 movd m1, [r4 + 19]
19320 palignr m1, m0, 1
19321 punpcklbw m0, m1
19322 pmaddubsw m1, m0, m6
19323 pmulhrsw m1, m7
19324 movu m2, [r4 + 26]
19325 movd m3, [r4 + 27]
19326 palignr m3, m2, 1
19327 punpcklbw m2, m3
19328 pmaddubsw m3, m2, m6
19329 pmulhrsw m3, m7
19330 packuswb m1, m3
19331 movu [r0 + 104 * 16], m1
19332
19333 movu m1, [r4 + 34]
19334 movd m3, [r4 + 35]
19335 palignr m3, m1, 1
19336 punpcklbw m1, m3
19337 pmaddubsw m3, m1, m6
19338 pmulhrsw m3, m7
19339 movu m4, [r4 + 42]
19340 movd m5, [r4 + 43]
19341 palignr m5, m4, 1
19342 punpcklbw m4, m5
19343 pmaddubsw m5, m4, m6
19344 pmulhrsw m5, m7
19345 packuswb m3, m5
19346 movu [r0 + 105 * 16], m3
19347
19348 ; mode 4 [row 25] - no reload: m6 still holds [r5 + 2 * 16] from mode 3 [row 20]
19349 pmaddubsw m3, m0, m6
19350 pmulhrsw m3, m7
19351 pmaddubsw m5, m2, m6
19352 pmulhrsw m5, m7
19353 packuswb m3, m5
19354 movu [r0 + 178 * 16], m3
19355 pmaddubsw m3, m1, m6
19356 pmulhrsw m3, m7
19357 pmaddubsw m5, m4, m6
19358 pmulhrsw m5, m7
19359 packuswb m3, m5
19360 movu [r0 + 179 * 16], m3
19361
19362 ; mode 4 [row 26]
19363 movu m6, [r5 + 23 * 16]
19364 pmaddubsw m3, m0, m6
19365 pmulhrsw m3, m7
19366 pmaddubsw m5, m2, m6
19367 pmulhrsw m5, m7
19368 packuswb m3, m5
19369 movu [r0 + 180 * 16], m3
19370 pmaddubsw m3, m1, m6
19371 pmulhrsw m3, m7
19372 pmaddubsw m5, m4, m6
19373 pmulhrsw m5, m7
19374 packuswb m3, m5
19375 movu [r0 + 181 * 16], m3
19376
19377 ; mode 3 [row 21]
19378 movu m6, [r5 + 28 * 16]
19379 pmaddubsw m3, m0, m6
19380 pmulhrsw m3, m7
19381 pmaddubsw m5, m2, m6
19382 pmulhrsw m5, m7
19383 packuswb m3, m5
19384 movu [r0 + 106 * 16], m3
19385 pmaddubsw m3, m1, m6
19386 pmulhrsw m3, m7
19387 pmaddubsw m5, m4, m6
19388 pmulhrsw m5, m7
19389 packuswb m3, m5
19390 movu [r0 + 107 * 16], m3
19391
19392 ; mode 3 [row 22]
19393 movu m6, [r5 + 22 * 16]
19394 movu m0, [r4 + 19]
19395 movd m1, [r4 + 20]
19396 palignr m1, m0, 1
19397 punpcklbw m0, m1
19398 pmaddubsw m1, m0, m6
19399 pmulhrsw m1, m7
19400 movu m2, [r4 + 27]
19401 movd m3, [r4 + 28]
19402 palignr m3, m2, 1
19403 punpcklbw m2, m3
19404 pmaddubsw m3, m2, m6
19405 pmulhrsw m3, m7
19406 packuswb m1, m3
19407 movu [r0 + 108 * 16], m1
19408
19409 movu m1, [r4 + 35]
19410 movd m3, [r4 + 36]
19411 palignr m3, m1, 1
19412 punpcklbw m1, m3
19413 pmaddubsw m3, m1, m6
19414 pmulhrsw m3, m7
19415 movu m4, [r4 + 43]
19416 movd m5, [r4 + 44]
19417 palignr m5, m4, 1
19418 punpcklbw m4, m5
19419 pmaddubsw m5, m4, m6
19420 pmulhrsw m5, m7
19421 packuswb m3, m5
19422 movu [r0 + 109 * 16], m3
19423
19424 ; mode 4 [row 27]
19425 movu m6, [r5 + 12 * 16]
19426 pmaddubsw m3, m0, m6
19427 pmulhrsw m3, m7
19428 pmaddubsw m5, m2, m6
19429 pmulhrsw m5, m7
19430 packuswb m3, m5
19431 movu [r0 + 182 * 16], m3
19432 pmaddubsw m3, m1, m6
19433 pmulhrsw m3, m7
19434 pmaddubsw m5, m4, m6
19435 pmulhrsw m5, m7
19436 packuswb m3, m5
19437 movu [r0 + 183 * 16], m3
19438
19439 ; mode 3 [row 23]
19440 movu m6, [r5 + 16 * 16]
19441 movu m0, [r4 + 20]
19442 movd m1, [r4 + 21]
19443 palignr m1, m0, 1
19444 punpcklbw m0, m1
19445 pmaddubsw m1, m0, m6
19446 pmulhrsw m1, m7
19447 movu m2, [r4 + 28]
19448 movd m3, [r4 + 29]
19449 palignr m3, m2, 1
19450 punpcklbw m2, m3
19451 pmaddubsw m3, m2, m6
19452 pmulhrsw m3, m7
19453 packuswb m1, m3
19454 movu [r0 + 110 * 16], m1
19455
19456 movu m1, [r4 + 36]
19457 movd m3, [r4 + 37]
19458 palignr m3, m1, 1
19459 punpcklbw m1, m3
19460 pmaddubsw m3, m1, m6
19461 pmulhrsw m3, m7
19462 movu m4, [r4 + 44]
19463 movd m5, [r4 + 45]
19464 palignr m5, m4, 1
19465 punpcklbw m4, m5
19466 pmaddubsw m5, m4, m6
19467 pmulhrsw m5, m7
19468 packuswb m3, m5
19469 movu [r0 + 111 * 16], m3
19470
19471 ; mode 4 [row 28]
19472 movu m6, [r5 + 1 * 16]
19473 pmaddubsw m3, m0, m6
19474 pmulhrsw m3, m7
19475 pmaddubsw m5, m2, m6
19476 pmulhrsw m5, m7
19477 packuswb m3, m5
19478 movu [r0 + 184 * 16], m3
19479 pmaddubsw m3, m1, m6
19480 pmulhrsw m3, m7
19481 pmaddubsw m5, m4, m6
19482 pmulhrsw m5, m7
19483 packuswb m3, m5
19484 movu [r0 + 185 * 16], m3
19485
19486 ; mode 4 [row 29]
19487 movu m6, [r5 + 22 * 16]
19488 pmaddubsw m3, m0, m6
19489 pmulhrsw m3, m7
19490 pmaddubsw m5, m2, m6
19491 pmulhrsw m5, m7
19492 packuswb m3, m5
19493 movu [r0 + 186 * 16], m3
19494 pmaddubsw m3, m1, m6
19495 pmulhrsw m3, m7
19496 pmaddubsw m5, m4, m6
19497 pmulhrsw m5, m7
19498 packuswb m3, m5
19499 movu [r0 + 187 * 16], m3
19500
19501 ; mode 3 [row 24]
19502 movu m6, [r5 + 10 * 16]
19503 movu m0, [r4 + 21]
19504 movd m1, [r4 + 22]
19505 palignr m1, m0, 1
19506 punpcklbw m0, m1
19507 pmaddubsw m1, m0, m6
19508 pmulhrsw m1, m7
19509 movu m2, [r4 + 29]
19510 movd m3, [r4 + 30]
19511 palignr m3, m2, 1
19512 punpcklbw m2, m3
19513 pmaddubsw m3, m2, m6
19514 pmulhrsw m3, m7
19515 packuswb m1, m3
19516 movu [r0 + 112 * 16], m1
19517
19518 movu m1, [r4 + 37]
19519 movd m3, [r4 + 38]
19520 palignr m3, m1, 1
19521 punpcklbw m1, m3
19522 pmaddubsw m3, m1, m6
19523 pmulhrsw m3, m7
19524 movu m4, [r4 + 45]
19525 movd m5, [r4 + 46]
19526 palignr m5, m4, 1
19527 punpcklbw m4, m5
19528 pmaddubsw m5, m4, m6
19529 pmulhrsw m5, m7
19530 packuswb m3, m5
19531 movu [r0 + 113 * 16], m3
19532
19533 ; mode 4 [row 30]
19534 movu m6, [r5 + 11 * 16]
19535 pmaddubsw m3, m0, m6
19536 pmulhrsw m3, m7
19537 pmaddubsw m5, m2, m6
19538 pmulhrsw m5, m7
19539 packuswb m3, m5
19540 movu [r0 + 188 * 16], m3
19541 pmaddubsw m3, m1, m6
19542 pmulhrsw m3, m7
19543 pmaddubsw m5, m4, m6
19544 pmulhrsw m5, m7
19545 packuswb m3, m5
19546 movu [r0 + 189 * 16], m3
19547
19548 ; mode 3 [row 25]
19549 movu m6, [r5 + 4 * 16]
19550 movu m0, [r4 + 22]
19551 movd m1, [r4 + 23]
19552 palignr m1, m0, 1
19553 punpcklbw m0, m1
19554 pmaddubsw m1, m0, m6
19555 pmulhrsw m1, m7
19556 movu m2, [r4 + 30]
19557 movd m3, [r4 + 31]
19558 palignr m3, m2, 1
19559 punpcklbw m2, m3
19560 pmaddubsw m3, m2, m6
19561 pmulhrsw m3, m7
19562 packuswb m1, m3
19563 movu [r0 + 114 * 16], m1
19564
19565 movu m1, [r4 + 38]
19566 movd m3, [r4 + 39]
19567 palignr m3, m1, 1
19568 punpcklbw m1, m3
19569 pmaddubsw m3, m1, m6
19570 pmulhrsw m3, m7
19571 movu m4, [r4 + 46]
19572 movd m5, [r4 + 47]
19573 palignr m5, m4, 1
19574 punpcklbw m4, m5
19575 pmaddubsw m5, m4, m6
19576 pmulhrsw m5, m7
19577 packuswb m3, m5
19578 movu [r0 + 115 * 16], m3
19579
19580 ; mode 3 [row 26]
19581 movu m6, [r5 + 30 * 16]
19582 pmaddubsw m3, m0, m6
19583 pmulhrsw m3, m7
19584 pmaddubsw m5, m2, m6
19585 pmulhrsw m5, m7
19586 packuswb m3, m5
19587 movu [r0 + 116 * 16], m3
19588 pmaddubsw m3, m1, m6
19589 pmulhrsw m3, m7
19590 pmaddubsw m5, m4, m6
19591 pmulhrsw m5, m7
19592 packuswb m3, m5
19593 movu [r0 + 117 * 16], m3
19594
19595 ; mode 3 [row 27]
19596 movu m6, [r5 + 24 * 16]
19597 movu m0, [r4 + 23]
19598 movd m1, [r4 + 24]
19599 palignr m1, m0, 1
19600 punpcklbw m0, m1
19601 pmaddubsw m1, m0, m6
19602 pmulhrsw m1, m7
19603 movu m2, [r4 + 31]
19604 movd m3, [r4 + 32]
19605 palignr m3, m2, 1
19606 punpcklbw m2, m3
19607 pmaddubsw m3, m2, m6
19608 pmulhrsw m3, m7
19609 packuswb m1, m3
19610 movu [r0 + 118 * 16], m1
19611
19612 movu m1, [r4 + 39]
19613 movd m3, [r4 + 40]
19614 palignr m3, m1, 1
19615 punpcklbw m1, m3
19616 pmaddubsw m3, m1, m6
19617 pmulhrsw m3, m7
19618 movu m4, [r4 + 47]
19619 movd m5, [r4 + 48]
19620 palignr m5, m4, 1
19621 punpcklbw m4, m5
19622 pmaddubsw m5, m4, m6
19623 pmulhrsw m5, m7
19624 packuswb m3, m5
19625 movu [r0 + 119 * 16], m3
19626
19627 ; mode 3 [row 28]
19628 movu m6, [r5 + 18 * 16]
19629 movu m0, [r4 + 24]
19630 movd m1, [r4 + 25]
19631 palignr m1, m0, 1
19632 punpcklbw m0, m1
19633 pmaddubsw m1, m0, m6
19634 pmulhrsw m1, m7
19635 movu m2, [r4 + 32]
19636 movd m3, [r4 + 33]
19637 palignr m3, m2, 1
19638 punpcklbw m2, m3
19639 pmaddubsw m3, m2, m6
19640 pmulhrsw m3, m7
19641 packuswb m1, m3
19642 movu [r0 + 120 * 16], m1
19643
19644 movu m1, [r4 + 40]
19645 movd m3, [r4 + 41]
19646 palignr m3, m1, 1
19647 punpcklbw m1, m3
19648 pmaddubsw m3, m1, m6
19649 pmulhrsw m3, m7
19650 movu m4, [r4 + 48]
19651 movd m5, [r4 + 49]
19652 palignr m5, m4, 1
19653 punpcklbw m4, m5
19654 pmaddubsw m5, m4, m6
19655 pmulhrsw m5, m7
19656 packuswb m3, m5
19657 movu [r0 + 121 * 16], m3
19658
19659 ; mode 3 [row 29]
19660 movu m6, [r5 + 12 * 16]
19661 movu m0, [r4 + 25]
19662 movd m1, [r4 + 26]
19663 palignr m1, m0, 1
19664 punpcklbw m0, m1
19665 pmaddubsw m1, m0, m6
19666 pmulhrsw m1, m7
19667 movu m2, [r4 + 33]
19668 movd m3, [r4 + 34]
19669 palignr m3, m2, 1
19670 punpcklbw m2, m3
19671 pmaddubsw m3, m2, m6
19672 pmulhrsw m3, m7
19673 packuswb m1, m3
19674 movu [r0 + 122 * 16], m1
19675
19676 movu m1, [r4 + 41]
19677 movd m3, [r4 + 42]
19678 palignr m3, m1, 1
19679 punpcklbw m1, m3
19680 pmaddubsw m3, m1, m6
19681 pmulhrsw m3, m7
19682 movu m4, [r4 + 49]
19683 movd m5, [r4 + 50]
19684 palignr m5, m4, 1
19685 punpcklbw m4, m5
19686 pmaddubsw m5, m4, m6
19687 pmulhrsw m5, m7
19688 packuswb m3, m5
19689 movu [r0 + 123 * 16], m3
19690
19691 ; mode 3 [row 30]
19692 movu m6, [r5 + 6 * 16]
19693 movu m0, [r4 + 26]
19694 movd m1, [r4 + 27]
19695 palignr m1, m0, 1
19696 punpcklbw m0, m1
19697 pmaddubsw m1, m0, m6
19698 pmulhrsw m1, m7
19699 movu m2, [r4 + 34]
19700 movd m3, [r4 + 35]
19701 palignr m3, m2, 1
19702 punpcklbw m2, m3
19703 pmaddubsw m3, m2, m6
19704 pmulhrsw m3, m7
19705 packuswb m1, m3
19706 movu [r0 + 124 * 16], m1
19707
19708 movu m1, [r4 + 42]
19709 movd m3, [r4 + 43]
19710 palignr m3, m1, 1
19711 punpcklbw m1, m3
19712 pmaddubsw m3, m1, m6
19713 pmulhrsw m3, m7
19714 movu m4, [r4 + 50]
19715 movd m5, [r4 + 51]
19716 palignr m5, m4, 1
19717 punpcklbw m4, m5
19718 pmaddubsw m5, m4, m6
19719 pmulhrsw m5, m7
19720 packuswb m3, m5
19721 movu [r0 + 125 * 16], m3
19722
19723 ; mode 10
19724 movu m1, [r2 + 1]
19725 movu m2, [r2 + 17]
19726 movu [r0 + 512 * 16], m1
19727 movu [r0 + 513 * 16], m2
19728 movu [r0 + 514 * 16], m1
19729 movu [r0 + 515 * 16], m2
19730 movu [r0 + 516 * 16], m1
19731 movu [r0 + 517 * 16], m2
19732 movu [r0 + 518 * 16], m1
19733 movu [r0 + 519 * 16], m2
19734 movu [r0 + 520 * 16], m1
19735 movu [r0 + 521 * 16], m2
19736 movu [r0 + 522 * 16], m1
19737 movu [r0 + 523 * 16], m2
19738 movu [r0 + 524 * 16], m1
19739 movu [r0 + 525 * 16], m2
19740 movu [r0 + 526 * 16], m1
19741 movu [r0 + 527 * 16], m2
19742
19743 movu [r0 + 528 * 16], m1
19744 movu [r0 + 529 * 16], m2
19745 movu [r0 + 530 * 16], m1
19746 movu [r0 + 531 * 16], m2
19747 movu [r0 + 532 * 16], m1
19748 movu [r0 + 533 * 16], m2
19749 movu [r0 + 534 * 16], m1
19750 movu [r0 + 535 * 16], m2
19751 movu [r0 + 536 * 16], m1
19752 movu [r0 + 537 * 16], m2
19753 movu [r0 + 538 * 16], m1
19754 movu [r0 + 539 * 16], m2
19755 movu [r0 + 540 * 16], m1
19756 movu [r0 + 541 * 16], m2
19757 movu [r0 + 542 * 16], m1
19758 movu [r0 + 543 * 16], m2
19759
19760 movu [r0 + 544 * 16], m1
19761 movu [r0 + 545 * 16], m2
19762 movu [r0 + 546 * 16], m1
19763 movu [r0 + 547 * 16], m2
19764 movu [r0 + 548 * 16], m1
19765 movu [r0 + 549 * 16], m2
19766 movu [r0 + 550 * 16], m1
19767 movu [r0 + 551 * 16], m2
19768 movu [r0 + 552 * 16], m1
19769 movu [r0 + 553 * 16], m2
19770 movu [r0 + 554 * 16], m1
19771 movu [r0 + 555 * 16], m2
19772 movu [r0 + 556 * 16], m1
19773 movu [r0 + 557 * 16], m2
19774 movu [r0 + 558 * 16], m1
19775 movu [r0 + 559 * 16], m2
19776
19777 movu [r0 + 560 * 16], m1
19778 movu [r0 + 561 * 16], m2
19779 movu [r0 + 562 * 16], m1
19780 movu [r0 + 563 * 16], m2
19781 movu [r0 + 564 * 16], m1
19782 movu [r0 + 565 * 16], m2
19783 movu [r0 + 566 * 16], m1
19784 movu [r0 + 567 * 16], m2
19785 movu [r0 + 568 * 16], m1
19786 movu [r0 + 569 * 16], m2
19787 movu [r0 + 570 * 16], m1
19788 movu [r0 + 571 * 16], m2
19789 movu [r0 + 572 * 16], m1
19790 movu [r0 + 573 * 16], m2
19791 movu [r0 + 574 * 16], m1
19792 movu [r0 + 575 * 16], m2
19793
19794 ; mode 11 [row 0]
19795 movu m0, [r4]
19796
19797 ; mode 11 [row 15 - first half]
19798 movu [r0 + 606 * 16], m0
19799
19800 movu [r0 + 606 * 16], m0
19801
19802 ; mode 12 [row 31]
19803 pslldq m6, m0, 4
19804 pinsrb m6, [r3 + 26], 0
19805 pinsrb m6, [r3 + 19], 1
19806 pinsrb m6, [r3 + 13], 2
19807 pinsrb m6, [r3 + 6], 3
19808 movu [r0 + 702 * 16], m6
19809 movu m6, [r4 + 12]
19810 movu [r0 + 703 * 16], m6
19811
19812 ; mode 11 [row 31]
19813 pslldq m6, m0, 1
19814 pinsrb m6, [r3 + 16], 0
19815 movu [r0 + 638 * 16], m6
19816 movu m6, [r4 + 15]
19817 movu [r0 + 639 * 16], m6
19818
19819 movd m1, [r4 + 1]
19820 palignr m1, m0, 1
19821 punpcklbw m0, m1
19822 pmaddubsw m1, m0, [r5 + 30 * 16]
19823 pmulhrsw m1, m7
19824 movu m2, [r4 + 8]
19825 movd m3, [r4 + 9]
19826 palignr m3, m2, 1
19827 punpcklbw m2, m3
19828 pmaddubsw m3, m2, [r5 + 30 * 16]
19829 pmulhrsw m3, m7
19830 packuswb m1, m3
19831 movu [r0 + 576 * 16], m1
19832
19833 movu m1, [r4 + 16]
19834
19835 ; mode 11 [row 15 - second half]
19836 movu [r0 + 607 * 16], m1
19837
19838 movd m3, [r4 + 17]
19839 palignr m3, m1, 1
19840 punpcklbw m1, m3
19841 pmaddubsw m3, m1, [r5 + 30 * 16]
19842 pmulhrsw m3, m7
19843 movu m4, [r4 + 24]
19844 movd m5, [r4 + 25]
19845 palignr m5, m4, 1
19846 punpcklbw m4, m5
19847 pmaddubsw m5, m4, [r5 + 30 * 16]
19848 pmulhrsw m5, m7
19849 packuswb m3, m5
19850 movu [r0 + 577 * 16], m3
19851
19852 ; mode 11 [row 1]
19853 pmaddubsw m3, m0, [r5 + 28 * 16]
19854 pmulhrsw m3, m7
19855 pmaddubsw m5, m2, [r5 + 28 * 16]
19856 pmulhrsw m5, m7
19857 packuswb m3, m5
19858 movu [r0 + 578 * 16], m3
19859 pmaddubsw m3, m1, [r5 + 28 * 16]
19860 pmulhrsw m3, m7
19861 pmaddubsw m5, m4, [r5 + 28 * 16]
19862 pmulhrsw m5, m7
19863 packuswb m3, m5
19864 movu [r0 + 579 * 16], m3
19865
19866 ; mode 11 [row 2]
19867 pmaddubsw m3, m0, [r5 + 26 * 16]
19868 pmulhrsw m3, m7
19869 pmaddubsw m5, m2, [r5 + 26 * 16]
19870 pmulhrsw m5, m7
19871 packuswb m3, m5
19872 movu [r0 + 580 * 16], m3
19873 pmaddubsw m3, m1, [r5 + 26 * 16]
19874 pmulhrsw m3, m7
19875 pmaddubsw m5, m4, [r5 + 26 * 16]
19876 pmulhrsw m5, m7
19877 packuswb m3, m5
19878 movu [r0 + 581 * 16], m3
19879
19880 ; mode 11 [row 3]
19881 pmaddubsw m3, m0, [r5 + 24 * 16]
19882 pmulhrsw m3, m7
19883 pmaddubsw m5, m2, [r5 + 24 * 16]
19884 pmulhrsw m5, m7
19885 packuswb m3, m5
19886 movu [r0 + 582 * 16], m3
19887 pmaddubsw m3, m1, [r5 + 24 * 16]
19888 pmulhrsw m3, m7
19889 pmaddubsw m5, m4, [r5 + 24 * 16]
19890 pmulhrsw m5, m7
19891 packuswb m3, m5
19892 movu [r0 + 583 * 16], m3
19893
19894 ; mode 11 [row 4]
19895 pmaddubsw m3, m0, [r5 + 22 * 16]
19896 pmulhrsw m3, m7
19897 pmaddubsw m5, m2, [r5 + 22 * 16]
19898 pmulhrsw m5, m7
19899 packuswb m3, m5
19900 movu [r0 + 584 * 16], m3
19901
19902 ; mode 12 [row 1 - first half]
19903 movu [r0 + 642 * 16], m3
19904
19905 pmaddubsw m3, m1, [r5 + 22 * 16]
19906 pmulhrsw m3, m7
19907 pmaddubsw m5, m4, [r5 + 22 * 16]
19908 pmulhrsw m5, m7
19909 packuswb m3, m5
19910 movu [r0 + 585 * 16], m3
19911
19912 ; mode 12 [row 1 - second half]
19913 movu [r0 + 643 * 16], m3
19914
19915 ; mode 11 [row 5]
19916 pmaddubsw m3, m0, [r5 + 20 * 16]
19917 pmulhrsw m3, m7
19918 pmaddubsw m5, m2, [r5 + 20 * 16]
19919 pmulhrsw m5, m7
19920 packuswb m3, m5
19921 movu [r0 + 586 * 16], m3
19922 pmaddubsw m3, m1, [r5 + 20 * 16]
19923 pmulhrsw m3, m7
19924 pmaddubsw m5, m4, [r5 + 20 * 16]
19925 pmulhrsw m5, m7
19926 packuswb m3, m5
19927 movu [r0 + 587 * 16], m3
19928
19929 ; mode 11 [row 6]
19930 pmaddubsw m3, m0, [r5 + 18 * 16]
19931 pmulhrsw m3, m7
19932 pmaddubsw m5, m2, [r5 + 18 * 16]
19933 pmulhrsw m5, m7
19934 packuswb m3, m5
19935 movu [r0 + 588 * 16], m3
19936 pmaddubsw m3, m1, [r5 + 18 * 16]
19937 pmulhrsw m3, m7
19938 pmaddubsw m5, m4, [r5 + 18 * 16]
19939 pmulhrsw m5, m7
19940 packuswb m3, m5
19941 movu [r0 + 589 * 16], m3
19942
19943 ; mode 11 [row 7]
19944 pmaddubsw m3, m0, [r5 + 16 * 16]
19945 pmulhrsw m3, m7
19946 pmaddubsw m5, m2, [r5 + 16 * 16]
19947 pmulhrsw m5, m7
19948 packuswb m3, m5
19949 movu [r0 + 590 * 16], m3
19950 pmaddubsw m3, m1, [r5 + 16 * 16]
19951 pmulhrsw m3, m7
19952 pmaddubsw m5, m4, [r5 + 16 * 16]
19953 pmulhrsw m5, m7
19954 packuswb m3, m5
19955 movu [r0 + 591 * 16], m3
19956
19957 ; mode 11 [row 8]
19958 pmaddubsw m3, m0, [r5 + 14 * 16]
19959 pmulhrsw m3, m7
19960 pmaddubsw m5, m2, [r5 + 14 * 16]
19961 pmulhrsw m5, m7
19962 packuswb m3, m5
19963 movu [r0 + 592 * 16], m3
19964
19965 ; mode 13 [row 1 - first half]
19966 movu [r0 + 706 * 16], m3
19967
19968 pmaddubsw m3, m1, [r5 + 14 * 16]
19969 pmulhrsw m3, m7
19970 pmaddubsw m5, m4, [r5 + 14 * 16]
19971 pmulhrsw m5, m7
19972 packuswb m3, m5
19973 movu [r0 + 593 * 16], m3
19974
19975 ; mode 13 [row 1 - second half]
19976 movu [r0 + 707 * 16], m3
19977
19978 ; mode 11 [row 9]
19979 pmaddubsw m3, m0, [r5 + 12 * 16]
19980 pmulhrsw m3, m7
19981 pmaddubsw m5, m2, [r5 + 12 * 16]
19982 pmulhrsw m5, m7
19983 packuswb m3, m5
19984 movu [r0 + 594 * 16], m3
19985
19986 ; mode 12 [row 3 - first half]
19987 movu [r0 + 646 * 16], m3
19988
19989 pmaddubsw m3, m1, [r5 + 12 * 16]
19990 pmulhrsw m3, m7
19991 pmaddubsw m5, m4, [r5 + 12 * 16]
19992 pmulhrsw m5, m7
19993 packuswb m3, m5
19994 movu [r0 + 595 * 16], m3
19995
19996 ; mode 12 [row 3 - second half]
19997 movu [r0 + 647 * 16], m3
19998
19999 ; mode 11 [row 10]
20000 pmaddubsw m3, m0, [r5 + 10 * 16]
20001 pmulhrsw m3, m7
20002 pmaddubsw m5, m2, [r5 + 10 * 16]
20003 pmulhrsw m5, m7
20004 packuswb m3, m5
20005 movu [r0 + 596 * 16], m3
20006 pmaddubsw m3, m1, [r5 + 10 * 16]
20007 pmulhrsw m3, m7
20008 pmaddubsw m5, m4, [r5 + 10 * 16]
20009 pmulhrsw m5, m7
20010 packuswb m3, m5
20011 movu [r0 + 597 * 16], m3
20012
20013 ; mode 11 [row 11]
20014 pmaddubsw m3, m0, [r5 + 8 * 16]
20015 pmulhrsw m3, m7
20016 pmaddubsw m5, m2, [r5 + 8 * 16]
20017 pmulhrsw m5, m7
20018 packuswb m3, m5
20019 movu [r0 + 598 * 16], m3
20020 pmaddubsw m3, m1, [r5 + 8 * 16]
20021 pmulhrsw m3, m7
20022 pmaddubsw m5, m4, [r5 + 8 * 16]
20023 pmulhrsw m5, m7
20024 packuswb m3, m5
20025 movu [r0 + 599 * 16], m3
20026
20027 ; mode 11 [row 12]
20028 pmaddubsw m3, m0, [r5 + 6 * 16]
20029 pmulhrsw m3, m7
20030 pmaddubsw m5, m2, [r5 + 6 * 16]
20031 pmulhrsw m5, m7
20032 packuswb m3, m5
20033 movu [r0 + 600 * 16], m3
20034
20035 ; mode 14 [row 1 - first half]
20036 movu [r0 + 770 * 16], m3
20037
20038 pmaddubsw m3, m1, [r5 + 6 * 16]
20039 pmulhrsw m3, m7
20040 pmaddubsw m5, m4, [r5 + 6 * 16]
20041 pmulhrsw m5, m7
20042 packuswb m3, m5
20043 movu [r0 + 601 * 16], m3
20044
20045 ; mode 14 [row 1 - second half]
20046 movu [r0 + 771 * 16], m3
20047
20048 ; mode 11 [row 13]
20049 pmaddubsw m3, m0, [r5 + 4 * 16]
20050 pmulhrsw m3, m7
20051 pmaddubsw m5, m2, [r5 + 4 * 16]
20052 pmulhrsw m5, m7
20053 packuswb m3, m5
20054 movu [r0 + 602 * 16], m3
20055 pmaddubsw m3, m1, [r5 + 4 * 16]
20056 pmulhrsw m3, m7
20057 pmaddubsw m5, m4, [r5 + 4 * 16]
20058 pmulhrsw m5, m7
20059 packuswb m3, m5
20060 movu [r0 + 603 * 16], m3
20061
20062 ; mode 11 [row 14]
20063 pmaddubsw m3, m0, [r5 + 2 * 16]
20064 pmulhrsw m3, m7
20065 pmaddubsw m5, m2, [r5 + 2 * 16]
20066 pmulhrsw m5, m7
20067 packuswb m3, m5
20068 movu [r0 + 604 * 16], m3
20069
20070 ; mode 12 [row 5 - first half]
20071 movu [r0 + 650 * 16], m3
20072
20073 pmaddubsw m3, m1, [r5 + 2 * 16]
20074 pmulhrsw m3, m7
20075 pmaddubsw m5, m4, [r5 + 2 * 16]
20076 pmulhrsw m5, m7
20077 packuswb m3, m5
20078 movu [r0 + 605 * 16], m3
20079
20080 ; mode 12 [row 5 - second half]
20081 movu [r0 + 651 * 16], m3
20082
20083 ; mode 12 [row 0]
20084 pmaddubsw m3, m0, [r5 + 27 * 16]
20085 pmulhrsw m3, m7
20086 pmaddubsw m5, m2, [r5 + 27 * 16]
20087 pmulhrsw m5, m7
20088 packuswb m3, m5
20089 movu [r0 + 640 * 16], m3
20090 pmaddubsw m3, m1, [r5 + 27 * 16]
20091 pmulhrsw m3, m7
20092 pmaddubsw m5, m4, [r5 + 27 * 16]
20093 pmulhrsw m5, m7
20094 packuswb m3, m5
20095 movu [r0 + 641 * 16], m3
20096
20097 ; mode 12 [row 2]
20098 pmaddubsw m3, m0, [r5 + 17 * 16]
20099 pmulhrsw m3, m7
20100 pmaddubsw m5, m2, [r5 + 17 * 16]
20101 pmulhrsw m5, m7
20102 packuswb m3, m5
20103 movu [r0 + 644 * 16], m3
20104 pmaddubsw m3, m1, [r5 + 17 * 16]
20105 pmulhrsw m3, m7
20106 pmaddubsw m5, m4, [r5 + 17 * 16]
20107 pmulhrsw m5, m7
20108 packuswb m3, m5
20109 movu [r0 + 645 * 16], m3
20110
20111 ; mode 12 [row 4]
20112 pmaddubsw m3, m0, [r5 + 7 * 16]
20113 pmulhrsw m3, m7
20114 pmaddubsw m5, m2, [r5 + 7 * 16]
20115 pmulhrsw m5, m7
20116 packuswb m3, m5
20117 movu [r0 + 648 * 16], m3
20118 pmaddubsw m3, m1, [r5 + 7 * 16]
20119 pmulhrsw m3, m7
20120 pmaddubsw m5, m4, [r5 + 7 * 16]
20121 pmulhrsw m5, m7
20122 packuswb m3, m5
20123 movu [r0 + 649 * 16], m3
20124
20125 ; mode 13 [row 0]
20126 pmaddubsw m3, m0, [r5 + 23 * 16]
20127 pmulhrsw m3, m7
20128 pmaddubsw m5, m2, [r5 + 23 * 16]
20129 pmulhrsw m5, m7
20130 packuswb m3, m5
20131 movu [r0 + 704 * 16], m3
20132 pmaddubsw m3, m1, [r5 + 23 * 16]
20133 pmulhrsw m3, m7
20134 pmaddubsw m5, m4, [r5 + 23 * 16]
20135 pmulhrsw m5, m7
20136 packuswb m3, m5
20137 movu [r0 + 705 * 16], m3
20138
20139 ; mode 13 [row 2]
20140 pmaddubsw m3, m0, [r5 + 5 * 16]
20141 pmulhrsw m3, m7
20142 pmaddubsw m5, m2, [r5 + 5 * 16]
20143 pmulhrsw m5, m7
20144 packuswb m3, m5
20145 movu [r0 + 708 * 16], m3
20146 pmaddubsw m3, m1, [r5 + 5 * 16]
20147 pmulhrsw m3, m7
20148 pmaddubsw m5, m4, [r5 + 5 * 16]
20149 pmulhrsw m5, m7
20150 packuswb m3, m5
20151 movu [r0 + 709 * 16], m3
20152
20153 ; mode 14 [row 0]
20154 pmaddubsw m3, m0, [r5 + 19 * 16]
20155 pmulhrsw m3, m7
20156 pmaddubsw m5, m2, [r5 + 19 * 16]
20157 pmulhrsw m5, m7
20158 packuswb m3, m5
20159 movu [r0 + 768 * 16], m3
20160 pmaddubsw m3, m1, [r5 + 19 * 16]
20161 pmulhrsw m3, m7
20162 pmaddubsw m5, m4, [r5 + 19 * 16]
20163 pmulhrsw m5, m7
20164 packuswb m3, m5
20165 movu [r0 + 769 * 16], m3
20166
20167 ; mode 15 [row 0]
20168 pmaddubsw m3, m0, [r5 + 15 * 16]
20169 pmulhrsw m3, m7
20170 pmaddubsw m5, m2, [r5 + 15 * 16]
20171 pmulhrsw m5, m7
20172 packuswb m3, m5
20173 movu [r0 + 832 * 16], m3
20174 pmaddubsw m3, m1, [r5 + 15 * 16]
20175 pmulhrsw m3, m7
20176 pmaddubsw m5, m4, [r5 + 15 * 16]
20177 pmulhrsw m5, m7
20178 packuswb m3, m5
20179 movu [r0 + 833 * 16], m3
20180
20181 ; mode 11 [row 16]
20182 pslldq m0, 2
20183 pinsrb m0, [r4 + 0], 1
20184 pinsrb m0, [r3 + 16], 0
20185 pmaddubsw m3, m0, [r5 + 30 * 16]
20186 pmulhrsw m3, m7
20187 pslldq m2, 2
20188 pinsrb m2, [r4 + 8], 1
20189 pinsrb m2, [r4 + 7], 0
20190 pmaddubsw m5, m2, [r5 + 30 * 16]
20191 pmulhrsw m5, m7
20192 packuswb m3, m5
20193 movu [r0 + 608 * 16], m3
20194 pslldq m1, 2
20195 pinsrb m1, [r4 + 16], 1
20196 pinsrb m1, [r4 + 15], 0
20197 pmaddubsw m3, m1, [r5 + 30 * 16]
20198 pmulhrsw m3, m7
20199 pslldq m4, 2
20200 pinsrb m4, [r4 + 24], 1
20201 pinsrb m4, [r4 + 23], 0
20202 pmaddubsw m5, m4, [r5 + 30 * 16]
20203 pmulhrsw m5, m7
20204 packuswb m3, m5
20205 movu [r0 + 609 * 16], m3
20206
20207 ; mode 11 [row 17]
20208 pmaddubsw m3, m0, [r5 + 28 * 16]
20209 pmulhrsw m3, m7
20210 pmaddubsw m5, m2, [r5 + 28 * 16]
20211 pmulhrsw m5, m7
20212 packuswb m3, m5
20213 movu [r0 + 610 * 16], m3
20214 pmaddubsw m3, m1, [r5 + 28 * 16]
20215 pmulhrsw m3, m7
20216 pmaddubsw m5, m4, [r5 + 28 * 16]
20217 pmulhrsw m5, m7
20218 packuswb m3, m5
20219 movu [r0 + 611 * 16], m3
20220
20221 ; mode 11 [row 18]
20222 pmaddubsw m3, m0, [r5 + 26 * 16]
20223 pmulhrsw m3, m7
20224 pmaddubsw m5, m2, [r5 + 26 * 16]
20225 pmulhrsw m5, m7
20226 packuswb m3, m5
20227 movu [r0 + 612 * 16], m3
20228 pmaddubsw m3, m1, [r5 + 26 * 16]
20229 pmulhrsw m3, m7
20230 pmaddubsw m5, m4, [r5 + 26 * 16]
20231 pmulhrsw m5, m7
20232 packuswb m3, m5
20233 movu [r0 + 613 * 16], m3
20234
20235 ; mode 11 [row 19]
20236 pmaddubsw m3, m0, [r5 + 24 * 16]
20237 pmulhrsw m3, m7
20238 pmaddubsw m5, m2, [r5 + 24 * 16]
20239 pmulhrsw m5, m7
20240 packuswb m3, m5
20241 movu [r0 + 614 * 16], m3
20242 pmaddubsw m3, m1, [r5 + 24 * 16]
20243 pmulhrsw m3, m7
20244 pmaddubsw m5, m4, [r5 + 24 * 16]
20245 pmulhrsw m5, m7
20246 packuswb m3, m5
20247 movu [r0 + 615 * 16], m3
20248
20249 ; mode 11 [row 20]
20250 pmaddubsw m3, m0, [r5 + 22 * 16]
20251 pmulhrsw m3, m7
20252 pmaddubsw m5, m2, [r5 + 22 * 16]
20253 pmulhrsw m5, m7
20254 packuswb m3, m5
20255 movu [r0 + 616 * 16], m3
20256 pmaddubsw m3, m1, [r5 + 22 * 16]
20257 pmulhrsw m3, m7
20258 pmaddubsw m5, m4, [r5 + 22 * 16]
20259 pmulhrsw m5, m7
20260 packuswb m3, m5
20261 movu [r0 + 617 * 16], m3
20262
20263 ; mode 11 [row 21]
20264 pmaddubsw m3, m0, [r5 + 20 * 16]
20265 pmulhrsw m3, m7
20266 pmaddubsw m5, m2, [r5 + 20 * 16]
20267 pmulhrsw m5, m7
20268 packuswb m3, m5
20269 movu [r0 + 618 * 16], m3
20270 pmaddubsw m3, m1, [r5 + 20 * 16]
20271 pmulhrsw m3, m7
20272 pmaddubsw m5, m4, [r5 + 20 * 16]
20273 pmulhrsw m5, m7
20274 packuswb m3, m5
20275 movu [r0 + 619 * 16], m3
20276
20277 ; mode 11 [row 22]
20278 pmaddubsw m3, m0, [r5 + 18 * 16]
20279 pmulhrsw m3, m7
20280 pmaddubsw m5, m2, [r5 + 18 * 16]
20281 pmulhrsw m5, m7
20282 packuswb m3, m5
20283 movu [r0 + 620 * 16], m3
20284 pmaddubsw m3, m1, [r5 + 18 * 16]
20285 pmulhrsw m3, m7
20286 pmaddubsw m5, m4, [r5 + 18 * 16]
20287 pmulhrsw m5, m7
20288 packuswb m3, m5
20289 movu [r0 + 621 * 16], m3
20290
20291 ; mode 11 [row 23]
20292 pmaddubsw m3, m0, [r5 + 16 * 16]
20293 pmulhrsw m3, m7
20294 pmaddubsw m5, m2, [r5 + 16 * 16]
20295 pmulhrsw m5, m7
20296 packuswb m3, m5
20297 movu [r0 + 622 * 16], m3
20298 pmaddubsw m3, m1, [r5 + 16 * 16]
20299 pmulhrsw m3, m7
20300 pmaddubsw m5, m4, [r5 + 16 * 16]
20301 pmulhrsw m5, m7
20302 packuswb m3, m5
20303 movu [r0 + 623 * 16], m3
20304
20305 ; mode 11 [row 24]
20306 pmaddubsw m3, m0, [r5 + 14 * 16]
20307 pmulhrsw m3, m7
20308 pmaddubsw m5, m2, [r5 + 14 * 16]
20309 pmulhrsw m5, m7
20310 packuswb m3, m5
20311 movu [r0 + 624 * 16], m3
20312 pmaddubsw m3, m1, [r5 + 14 * 16]
20313 pmulhrsw m3, m7
20314 pmaddubsw m5, m4, [r5 + 14 * 16]
20315 pmulhrsw m5, m7
20316 packuswb m3, m5
20317 movu [r0 + 625 * 16], m3
20318
20319 ; mode 11 [row 25]
20320 pmaddubsw m3, m0, [r5 + 12 * 16]
20321 pmulhrsw m3, m7
20322 pmaddubsw m5, m2, [r5 + 12 * 16]
20323 pmulhrsw m5, m7
20324 packuswb m3, m5
20325 movu [r0 + 626 * 16], m3
20326 pmaddubsw m3, m1, [r5 + 12 * 16]
20327 pmulhrsw m3, m7
20328 pmaddubsw m5, m4, [r5 + 12 * 16]
20329 pmulhrsw m5, m7
20330 packuswb m3, m5
20331 movu [r0 + 627 * 16], m3
20332
20333 ; mode 11 [row 26]
20334 pmaddubsw m3, m0, [r5 + 10 * 16]
20335 pmulhrsw m3, m7
20336 pmaddubsw m5, m2, [r5 + 10 * 16]
20337 pmulhrsw m5, m7
20338 packuswb m3, m5
20339 movu [r0 + 628 * 16], m3
20340 pmaddubsw m3, m1, [r5 + 10 * 16]
20341 pmulhrsw m3, m7
20342 pmaddubsw m5, m4, [r5 + 10 * 16]
20343 pmulhrsw m5, m7
20344 packuswb m3, m5
20345 movu [r0 + 629 * 16], m3
20346
20347 ; mode 11 [row 27]
20348 pmaddubsw m3, m0, [r5 + 8 * 16]
20349 pmulhrsw m3, m7
20350 pmaddubsw m5, m2, [r5 + 8 * 16]
20351 pmulhrsw m5, m7
20352 packuswb m3, m5
20353 movu [r0 + 630 * 16], m3
20354 pmaddubsw m3, m1, [r5 + 8 * 16]
20355 pmulhrsw m3, m7
20356 pmaddubsw m5, m4, [r5 + 8 * 16]
20357 pmulhrsw m5, m7
20358 packuswb m3, m5
20359 movu [r0 + 631 * 16], m3
20360
20361 ; mode 11 [row 28]
20362 pmaddubsw m3, m0, [r5 + 6 * 16]
20363 pmulhrsw m3, m7
20364 pmaddubsw m5, m2, [r5 + 6 * 16]
20365 pmulhrsw m5, m7
20366 packuswb m3, m5
20367 movu [r0 + 632 * 16], m3
20368 pmaddubsw m3, m1, [r5 + 6 * 16]
20369 pmulhrsw m3, m7
20370 pmaddubsw m5, m4, [r5 + 6 * 16]
20371 pmulhrsw m5, m7
20372 packuswb m3, m5
20373 movu [r0 + 633 * 16], m3
20374
20375 ; mode 11 [row 29]
20376 pmaddubsw m3, m0, [r5 + 4 * 16]
20377 pmulhrsw m3, m7
20378 pmaddubsw m5, m2, [r5 + 4 * 16]
20379 pmulhrsw m5, m7
20380 packuswb m3, m5
20381 movu [r0 + 634 * 16], m3
20382 pmaddubsw m3, m1, [r5 + 4 * 16]
20383 pmulhrsw m3, m7
20384 pmaddubsw m5, m4, [r5 + 4 * 16]
20385 pmulhrsw m5, m7
20386 packuswb m3, m5
20387 movu [r0 + 635 * 16], m3
20388
20389 ; mode 11 [row 30]
20390 pmaddubsw m3, m0, [r5 + 2 * 16]
20391 pmulhrsw m3, m7
20392 pmaddubsw m5, m2, [r5 + 2 * 16]
20393 pmulhrsw m5, m7
20394 packuswb m3, m5
20395 movu [r0 + 636 * 16], m3
20396 pmaddubsw m3, m1, [r5 + 2 * 16]
20397 pmulhrsw m3, m7
20398 pmaddubsw m5, m4, [r5 + 2 * 16]
20399 pmulhrsw m5, m7
20400 packuswb m3, m5
20401 movu [r0 + 637 * 16], m3
20402
20403 ; mode 12 [row 6]
20404 pinsrb m0, [r3 + 6], 0
20405 pmaddubsw m3, m0, [r5 + 29 * 16]
20406 pmulhrsw m3, m7
20407 pmaddubsw m5, m2, [r5 + 29 * 16]
20408 pmulhrsw m5, m7
20409 packuswb m3, m5
20410 movu [r0 + 652 * 16], m3
20411 pmaddubsw m3, m1, [r5 + 29 * 16]
20412 pmulhrsw m3, m7
20413 pmaddubsw m5, m4, [r5 + 29 * 16]
20414 pmulhrsw m5, m7
20415 packuswb m3, m5
20416 movu [r0 + 653 * 16], m3
20417
20418 ; mode 12 [row 7]
20419 pmaddubsw m3, m0, [r5 + 24 * 16]
20420 pmulhrsw m3, m7
20421 pmaddubsw m5, m2, [r5 + 24 * 16]
20422 pmulhrsw m5, m7
20423 packuswb m3, m5
20424 movu [r0 + 654 * 16], m3
20425 pmaddubsw m3, m1, [r5 + 24 * 16]
20426 pmulhrsw m3, m7
20427 pmaddubsw m5, m4, [r5 + 24 * 16]
20428 pmulhrsw m5, m7
20429 packuswb m3, m5
20430 movu [r0 + 655 * 16], m3
20431
20432 ; mode 12 [row 8]
20433 pmaddubsw m3, m0, [r5 + 19 * 16]
20434 pmulhrsw m3, m7
20435 pmaddubsw m5, m2, [r5 + 19 * 16]
20436 pmulhrsw m5, m7
20437 packuswb m3, m5
20438 movu [r0 + 656 * 16], m3
20439 pmaddubsw m3, m1, [r5 + 19 * 16]
20440 pmulhrsw m3, m7
20441 pmaddubsw m5, m4, [r5 + 19 * 16]
20442 pmulhrsw m5, m7
20443 packuswb m3, m5
20444 movu [r0 + 657 * 16], m3
20445
20446 ; mode 12 [row 9]
20447 pmaddubsw m3, m0, [r5 + 14 * 16]
20448 pmulhrsw m3, m7
20449 pmaddubsw m5, m2, [r5 + 14 * 16]
20450 pmulhrsw m5, m7
20451 packuswb m3, m5
20452 movu [r0 + 658 * 16], m3
20453 pmaddubsw m3, m1, [r5 + 14 * 16]
20454 pmulhrsw m3, m7
20455 pmaddubsw m5, m4, [r5 + 14 * 16]
20456 pmulhrsw m5, m7
20457 packuswb m3, m5
20458 movu [r0 + 659 * 16], m3
20459
20460 ; mode 12 [row 10]
20461 pmaddubsw m3, m0, [r5 + 9 * 16]
20462 pmulhrsw m3, m7
20463 pmaddubsw m5, m2, [r5 + 9 * 16]
20464 pmulhrsw m5, m7
20465 packuswb m3, m5
20466 movu [r0 + 660 * 16], m3
20467 pmaddubsw m3, m1, [r5 + 9 * 16]
20468 pmulhrsw m3, m7
20469 pmaddubsw m5, m4, [r5 + 9 * 16]
20470 pmulhrsw m5, m7
20471 packuswb m3, m5
20472 movu [r0 + 661 * 16], m3
20473
20474 ; mode 12 [row 11]
20475 pmaddubsw m3, m0, [r5 + 4 * 16]
20476 pmulhrsw m3, m7
20477 pmaddubsw m5, m2, [r5 + 4 * 16]
20478 pmulhrsw m5, m7
20479 packuswb m3, m5
20480 movu [r0 + 662 * 16], m3
20481 pmaddubsw m3, m1, [r5 + 4 * 16]
20482 pmulhrsw m3, m7
20483 pmaddubsw m5, m4, [r5 + 4 * 16]
20484 pmulhrsw m5, m7
20485 packuswb m3, m5
20486 movu [r0 + 663 * 16], m3
20487
20488 ; mode 13 [row 3]
20489 movu m6, m0
20490 pinsrb m6, [r3 + 4], 0
20491 pmaddubsw m3, m6, [r5 + 28 * 16]
20492 pmulhrsw m3, m7
20493 pmaddubsw m5, m2, [r5 + 28 * 16]
20494 pmulhrsw m5, m7
20495 packuswb m3, m5
20496 movu [r0 + 710 * 16], m3
20497 pmaddubsw m3, m1, [r5 + 28 * 16]
20498 pmulhrsw m3, m7
20499 pmaddubsw m5, m4, [r5 + 28 * 16]
20500 pmulhrsw m5, m7
20501 packuswb m3, m5
20502 movu [r0 + 711 * 16], m3
20503
20504 ; mode 13 [row 4]
20505 pmaddubsw m3, m6, [r5 + 19 * 16]
20506 pmulhrsw m3, m7
20507 pmaddubsw m5, m2, [r5 + 19 * 16]
20508 pmulhrsw m5, m7
20509 packuswb m3, m5
20510 movu [r0 + 712 * 16], m3
20511 pmaddubsw m3, m1, [r5 + 19 * 16]
20512 pmulhrsw m3, m7
20513 pmaddubsw m5, m4, [r5 + 19 * 16]
20514 pmulhrsw m5, m7
20515 packuswb m3, m5
20516 movu [r0 + 713 * 16], m3
20517
20518 ; mode 13 [row 5]
20519 pmaddubsw m3, m6, [r5 + 10 * 16]
20520 pmulhrsw m3, m7
20521 pmaddubsw m5, m2, [r5 + 10 * 16]
20522 pmulhrsw m5, m7
20523 packuswb m3, m5
20524 movu [r0 + 714 * 16], m3
20525 pmaddubsw m3, m1, [r5 + 10 * 16]
20526 pmulhrsw m3, m7
20527 pmaddubsw m5, m4, [r5 + 10 * 16]
20528 pmulhrsw m5, m7
20529 packuswb m3, m5
20530 movu [r0 + 715 * 16], m3
20531
20532 ; mode 13 [row 6]
20533 pmaddubsw m3, m6, [r5 + 1 * 16]
20534 pmulhrsw m3, m7
20535 pmaddubsw m5, m2, [r5 + 1 * 16]
20536 pmulhrsw m5, m7
20537 packuswb m3, m5
20538 movu [r0 + 716 * 16], m3
20539 pmaddubsw m3, m1, [r5 + 1 * 16]
20540 pmulhrsw m3, m7
20541 pmaddubsw m5, m4, [r5 + 1 * 16]
20542 pmulhrsw m5, m7
20543 packuswb m3, m5
20544 movu [r0 + 717 * 16], m3
20545
20546 ; mode 14 [row 2]
20547 movu m6, m0
20548 pinsrb m6, [r4 + 0], 1
20549 pinsrb m6, [r3 + 2], 0
20550 pmaddubsw m3, m6, [r5 + 25 * 16]
20551 pmulhrsw m3, m7
20552 pmaddubsw m5, m2, [r5 + 25 * 16]
20553 pmulhrsw m5, m7
20554 packuswb m3, m5
20555 movu [r0 + 772 * 16], m3
20556 pmaddubsw m3, m1, [r5 + 25 * 16]
20557 pmulhrsw m3, m7
20558 pmaddubsw m5, m4, [r5 + 25 * 16]
20559 pmulhrsw m5, m7
20560 packuswb m3, m5
20561 movu [r0 + 773 * 16], m3
20562
20563 ; mode 14 [row 3]
20564 pmaddubsw m3, m6, [r5 + 12 * 16]
20565 pmulhrsw m3, m7
20566 pmaddubsw m5, m2, [r5 + 12 * 16]
20567 pmulhrsw m5, m7
20568 packuswb m3, m5
20569 movu [r0 + 774 * 16], m3
20570 pmaddubsw m3, m1, [r5 + 12 * 16]
20571 pmulhrsw m3, m7
20572 pmaddubsw m5, m4, [r5 + 12 * 16]
20573 pmulhrsw m5, m7
20574 packuswb m3, m5
20575 movu [r0 + 775 * 16], m3
20576
20577 ; mode 15 [row 1]
20578 pmaddubsw m3, m6, [r5 + 30 * 16]
20579 pmulhrsw m3, m7
20580 pmaddubsw m5, m2, [r5 + 30 * 16]
20581 pmulhrsw m5, m7
20582 packuswb m3, m5
20583 movu [r0 + 834 * 16], m3
20584 pmaddubsw m3, m1, [r5 + 30 * 16]
20585 pmulhrsw m3, m7
20586 pmaddubsw m5, m4, [r5 + 30 * 16]
20587 pmulhrsw m5, m7
20588 packuswb m3, m5
20589 movu [r0 + 835 * 16], m3
20590
20591 ; mode 15 [row 2]
20592 pmaddubsw m3, m6, [r5 + 13 * 16]
20593 pmulhrsw m3, m7
20594 pmaddubsw m5, m2, [r5 + 13 * 16]
20595 pmulhrsw m5, m7
20596 packuswb m3, m5
20597 movu [r0 + 836 * 16], m3
20598 pmaddubsw m3, m1, [r5 + 13 * 16]
20599 pmulhrsw m3, m7
20600 pmaddubsw m5, m4, [r5 + 13 * 16]
20601 pmulhrsw m5, m7
20602 packuswb m3, m5
20603 movu [r0 + 837 * 16], m3
20604
20605 ; mode 15 [row 3]
20606 pslldq m6, 2
20607 pinsrb m6, [r3 + 2], 1
20608 pinsrb m6, [r3 + 4], 0
20609 pmaddubsw m3, m6, [r5 + 28 * 16]
20610 pmulhrsw m3, m7
20611 pslldq m2, 2
20612 pinsrb m2, [r4 + 7], 1
20613 pinsrb m2, [r4 + 6], 0
20614 pmaddubsw m5, m2, [r5 + 28 * 16]
20615 pmulhrsw m5, m7
20616 packuswb m3, m5
20617 movu [r0 + 838 * 16], m3
20618 pslldq m1, 2
20619 pinsrb m1, [r4 + 15], 1
20620 pinsrb m1, [r4 + 14], 0
20621 pmaddubsw m3, m1, [r5 + 28 * 16]
20622 pmulhrsw m3, m7
20623 pslldq m4, 2
20624 pinsrb m4, [r4 + 23], 1
20625 pinsrb m4, [r4 + 22], 0
20626 pmaddubsw m5, m4, [r5 + 28 * 16]
20627 pmulhrsw m5, m7
20628 packuswb m3, m5
20629 movu [r0 + 839 * 16], m3
20630
20631 ; mode 15 [row 4]
20632 pmaddubsw m3, m6, [r5 + 11 * 16]
20633 pmulhrsw m3, m7
20634 pmaddubsw m5, m2, [r5 + 11 * 16]
20635 pmulhrsw m5, m7
20636 packuswb m3, m5
20637 movu [r0 + 840 * 16], m3
20638 pmaddubsw m3, m1, [r5 + 11 * 16]
20639 pmulhrsw m3, m7
20640 pmaddubsw m5, m4, [r5 + 11 * 16]
20641 pmulhrsw m5, m7
20642 packuswb m3, m5
20643 movu [r0 + 841 * 16], m3
20644
20645 ; mode 15 [row 5, 0-7]
20646 pslldq m6, 2
20647 pinsrb m6, [r3 + 4], 1
20648 pinsrb m6, [r3 + 6], 0
20649 pmaddubsw m3, m6, [r5 + 26 * 16]
20650 pmulhrsw m3, m7
20651 packuswb m3, m3
20652 movh [r0 + 842 * 16], m3
20653
20654 ; mode 15 [row 6, 0-7]
20655 pmaddubsw m3, m6, [r5 + 9 * 16]
20656 pmulhrsw m3, m7
20657 packuswb m3, m3
20658 movh [r0 + 844 * 16], m3
20659
20660 ; mode 15 [row 7, 0-7]
20661 pslldq m6, 2
20662 pinsrb m6, [r3 + 6], 1
20663 pinsrb m6, [r3 + 8], 0
20664 pmaddubsw m3, m6, [r5 + 24 * 16]
20665 pmulhrsw m3, m7
20666 packuswb m3, m3
20667 movh [r0 + 846 * 16], m3
20668
20669 ; mode 15 [row 8, 0-7]
20670 pmaddubsw m3, m6, [r5 + 7 * 16]
20671 pmulhrsw m3, m7
20672 packuswb m3, m3
20673 movh [r0 + 848 * 16], m3
20674
20675 ; mode 15 [row 9, 0-7]
20676 pslldq m6, 2
20677 pinsrb m6, [r3 + 8], 1
20678 pinsrb m6, [r3 + 9], 0
20679 pmaddubsw m3, m6, [r5 + 22 * 16]
20680 pmulhrsw m3, m7
20681 packuswb m3, m3
20682 movh [r0 + 850 * 16], m3
20683
20684 ; mode 15 [row 10, 0-7]
20685 pmaddubsw m3, m6, [r5 + 5 * 16]
20686 pmulhrsw m3, m7
20687 packuswb m3, m3
20688 movh [r0 + 852 * 16], m3
20689
20690 ; mode 15 [row 11, 0-7]
20691 pslldq m6, 2
20692 pinsrb m6, [r3 + 9], 1
20693 pinsrb m6, [r3 + 11], 0
20694 pmaddubsw m3, m6, [r5 + 20 * 16]
20695 pmulhrsw m3, m7
20696 packuswb m3, m3
20697 movh [r0 + 854 * 16], m3
20698
20699 ; mode 15 [row 12, 0-7]
20700 pmaddubsw m3, m6, [r5 + 3 * 16]
20701 pmulhrsw m3, m7
20702 packuswb m3, m3
20703 movh [r0 + 856 * 16], m3
20704
20705 ; mode 15 [row 13, 0-7]
20706 pslldq m6, 2
20707 pinsrb m6, [r3 + 11], 1
20708 pinsrb m6, [r3 + 13], 0
20709 pmaddubsw m3, m6, [r5 + 18 * 16]
20710 pmulhrsw m3, m7
20711 packuswb m3, m3
20712 movh [r0 + 858 * 16], m3
20713
20714 ; mode 15 [row 14, 0-7]
20715 pmaddubsw m3, m6, [r5 + 1 * 16]
20716 pmulhrsw m3, m7
20717 packuswb m3, m3
20718 movh [r0 + 860 * 16], m3
20719
20720 ; mode 15 [row 15, 0-7]
20721 pslldq m6, 2
20722 pinsrb m6, [r3 + 13], 1
20723 pinsrb m6, [r3 + 15], 0
20724 pmaddubsw m3, m6, [r5 + 16 * 16]
20725 pmulhrsw m3, m7
20726 packuswb m3, m3
20727 movh [r0 + 862 * 16], m3
20728
20729 ; mode 15 [row 16, 0-7]
20730 pslldq m6, 2
20731 pinsrb m6, [r3 + 15], 1
20732 pinsrb m6, [r3 + 17], 0
20733 pmaddubsw m3, m6, [r5 + 31 * 16]
20734 pmulhrsw m3, m7
20735 packuswb m3, m3
20736 movh [r0 + 864 * 16], m3
20737
20738 ; mode 15 [row 17, 0-7]
20739 pmaddubsw m3, m6, [r5 + 14 * 16]
20740 pmulhrsw m3, m7
20741 packuswb m3, m3
20742 movh [r0 + 866 * 16], m3
20743
20744 ; mode 15 [row 18, 0-7]
20745 pslldq m6, 2
20746 pinsrb m6, [r3 + 17], 1
20747 pinsrb m6, [r3 + 19], 0
20748 pmaddubsw m3, m6, [r5 + 29 * 16]
20749 pmulhrsw m3, m7
20750 packuswb m3, m3
20751 movh [r0 + 868 * 16], m3
20752
20753 ; mode 15 [row 19, 0-7]
20754 pmaddubsw m3, m6, [r5 + 12 * 16]
20755 pmulhrsw m3, m7
20756 packuswb m3, m3
20757 movh [r0 + 870 * 16], m3
20758
20759 ; mode 15 [row 20, 0-7]
20760 pslldq m6, 2
20761 pinsrb m6, [r3 + 19], 1
20762 pinsrb m6, [r3 + 21], 0
20763 pmaddubsw m3, m6, [r5 + 27 * 16]
20764 pmulhrsw m3, m7
20765 packuswb m3, m3
20766 movh [r0 + 872 * 16], m3
20767
20768 ; mode 15 [row 21, 0-7]
20769 pmaddubsw m3, m6, [r5 + 10 * 16]
20770 pmulhrsw m3, m7
20771 packuswb m3, m3
20772 movh [r0 + 874 * 16], m3
20773
20774 ; mode 15 [row 22, 0-7]
20775 pslldq m6, 2
20776 pinsrb m6, [r3 + 21], 1
20777 pinsrb m6, [r3 + 23], 0
20778 pmaddubsw m3, m6, [r5 + 25 * 16]
20779 pmulhrsw m3, m7
20780 packuswb m3, m3
20781 movh [r0 + 876 * 16], m3
20782
20783 ; mode 15 [row 23, 0-7]
20784 pmaddubsw m3, m6, [r5 + 8 * 16]
20785 pmulhrsw m3, m7
20786 packuswb m3, m3
20787 movh [r0 + 878 * 16], m3
20788
20789 ; mode 15 [row 24, 0-7]
20790 pslldq m6, 2
20791 pinsrb m6, [r3 + 23], 1
20792 pinsrb m6, [r3 + 24], 0
20793 pmaddubsw m3, m6, [r5 + 23 * 16]
20794 pmulhrsw m3, m7
20795 packuswb m3, m3
20796 movh [r0 + 880 * 16], m3
20797
20798 ; mode 15 [row 25, 0-7]
20799 pmaddubsw m3, m6, [r5 + 6 * 16]
20800 pmulhrsw m3, m7
20801 packuswb m3, m3
20802 movh [r0 + 882 * 16], m3
20803
20804 ; mode 15 [row 26, 0-7]
20805 pslldq m6, 2
20806 pinsrb m6, [r3 + 24], 1
20807 pinsrb m6, [r3 + 26], 0
20808 pmaddubsw m3, m6, [r5 + 21 * 16]
20809 pmulhrsw m3, m7
20810 packuswb m3, m3
20811 movh [r0 + 884 * 16], m3
20812
20813 ; mode 15 [row 27, 0-7]
20814 pmaddubsw m3, m6, [r5 + 4 * 16]
20815 pmulhrsw m3, m7
20816 packuswb m3, m3
20817 movh [r0 + 886 * 16], m3
20818
20819 ; mode 15 [row 28, 0-7]
20820 pslldq m6, 2
20821 pinsrb m6, [r3 + 26], 1
20822 pinsrb m6, [r3 + 28], 0
20823 pmaddubsw m3, m6, [r5 + 19 * 16]
20824 pmulhrsw m3, m7
20825 packuswb m3, m3
20826 movh [r0 + 888 * 16], m3
20827
20828 ; mode 15 [row 29, 0-7]
20829 pmaddubsw m3, m6, [r5 + 2 * 16]
20830 pmulhrsw m3, m7
20831 packuswb m3, m3
20832 movh [r0 + 890 * 16], m3
20833
20834 ; mode 15 [row 30, 0-7]
20835 pslldq m6, 2
20836 pinsrb m6, [r3 + 28], 1
20837 pinsrb m6, [r3 + 30], 0
20838 pmaddubsw m3, m6, [r5 + 17 * 16]
20839 pmulhrsw m3, m7
20840 packuswb m3, m3
20841 movh [r0 + 892 * 16], m3
20842
20843 ; mode 15 [row 31, 0-7]
20844 pshufb m3, m6, [tab_S2]
20845 movh [r0 + 894 * 16], m3
20846
20847 ; mode 12 [row 12]
20848 pslldq m0, 2
20849 pinsrb m0, [r3 + 6], 1
20850 pinsrb m0, [r3 + 13], 0
20851 pmaddubsw m3, m0, [r5 + 31 * 16]
20852 pmulhrsw m3, m7
20853 pmaddubsw m5, m2, [r5 + 31 * 16]
20854 pmulhrsw m5, m7
20855 packuswb m3, m5
20856 movu [r0 + 664 * 16], m3
20857 pmaddubsw m3, m1, [r5 + 31 * 16]
20858 pmulhrsw m3, m7
20859 pmaddubsw m5, m4, [r5 + 31 * 16]
20860 pmulhrsw m5, m7
20861 packuswb m3, m5
20862 movu [r0 + 665 * 16], m3
20863
20864 ; mode 12 [row 13]
20865 pmaddubsw m3, m0, [r5 + 26 * 16]
20866 pmulhrsw m3, m7
20867 pmaddubsw m5, m2, [r5 + 26 * 16]
20868 pmulhrsw m5, m7
20869 packuswb m3, m5
20870 movu [r0 + 666 * 16], m3
20871 pmaddubsw m3, m1, [r5 + 26 * 16]
20872 pmulhrsw m3, m7
20873 pmaddubsw m5, m4, [r5 + 26 * 16]
20874 pmulhrsw m5, m7
20875 packuswb m3, m5
20876 movu [r0 + 667 * 16], m3
20877
20878 ; mode 12 [row 14]
20879 pmaddubsw m3, m0, [r5 + 21 * 16]
20880 pmulhrsw m3, m7
20881 pmaddubsw m5, m2, [r5 + 21 * 16]
20882 pmulhrsw m5, m7
20883 packuswb m3, m5
20884 movu [r0 + 668 * 16], m3
20885 pmaddubsw m3, m1, [r5 + 21 * 16]
20886 pmulhrsw m3, m7
20887 pmaddubsw m5, m4, [r5 + 21 * 16]
20888 pmulhrsw m5, m7
20889 packuswb m3, m5
20890 movu [r0 + 669 * 16], m3
20891
20892 ; mode 12 [row 15]
20893 pmaddubsw m3, m0, [r5 + 16 * 16]
20894 pmulhrsw m3, m7
20895 pmaddubsw m5, m2, [r5 + 16 * 16]
20896 pmulhrsw m5, m7
20897 packuswb m3, m5
20898 movu [r0 + 670 * 16], m3
20899 pmaddubsw m3, m1, [r5 + 16 * 16]
20900 pmulhrsw m3, m7
20901 pmaddubsw m5, m4, [r5 + 16 * 16]
20902 pmulhrsw m5, m7
20903 packuswb m3, m5
20904 movu [r0 + 671 * 16], m3
20905
20906 ; mode 12 [row 16]
20907 pmaddubsw m3, m0, [r5 + 11 * 16]
20908 pmulhrsw m3, m7
20909 pmaddubsw m5, m2, [r5 + 11 * 16]
20910 pmulhrsw m5, m7
20911 packuswb m3, m5
20912 movu [r0 + 672 * 16], m3
20913 pmaddubsw m3, m1, [r5 + 11 * 16]
20914 pmulhrsw m3, m7
20915 pmaddubsw m5, m4, [r5 + 11 * 16]
20916 pmulhrsw m5, m7
20917 packuswb m3, m5
20918 movu [r0 + 673 * 16], m3
20919
20920 ; mode 12 [row 17]
20921 pmaddubsw m3, m0, [r5 + 6 * 16]
20922 pmulhrsw m3, m7
20923 pmaddubsw m5, m2, [r5 + 6 * 16]
20924 pmulhrsw m5, m7
20925 packuswb m3, m5
20926 movu [r0 + 674 * 16], m3
20927 pmaddubsw m3, m1, [r5 + 6 * 16]
20928 pmulhrsw m3, m7
20929 pmaddubsw m5, m4, [r5 + 6 * 16]
20930 pmulhrsw m5, m7
20931 packuswb m3, m5
20932 movu [r0 + 675 * 16], m3
20933
20934 ; mode 12 [row 18]
20935 pmaddubsw m3, m0, [r5 + 1 * 16]
20936 pmulhrsw m3, m7
20937 pmaddubsw m5, m2, [r5 + 1 * 16]
20938 pmulhrsw m5, m7
20939 packuswb m3, m5
20940 movu [r0 + 676 * 16], m3
20941 pmaddubsw m3, m1, [r5 + 1 * 16]
20942 pmulhrsw m3, m7
20943 pmaddubsw m5, m4, [r5 + 1 * 16]
20944 pmulhrsw m5, m7
20945 packuswb m3, m5
20946 movu [r0 + 677 * 16], m3
20947
20948 ; mode 13 [row 7]
20949 movu m6, m0
20950 pinsrb m6, [r3 + 4], 2
20951 pinsrb m6, [r3 + 4], 1
20952 pinsrb m6, [r3 + 7], 0
20953 pmaddubsw m3, m6, [r5 + 24 * 16]
20954 pmulhrsw m3, m7
20955 pmaddubsw m5, m2, [r5 + 24 * 16]
20956 pmulhrsw m5, m7
20957 packuswb m3, m5
20958 movu [r0 + 718 * 16], m3
20959 pmaddubsw m3, m1, [r5 + 24 * 16]
20960 pmulhrsw m3, m7
20961 pmaddubsw m5, m4, [r5 + 24 * 16]
20962 pmulhrsw m5, m7
20963 packuswb m3, m5
20964 movu [r0 + 719 * 16], m3
20965
20966 ; mode 13 [row 8]
20967 pmaddubsw m3, m6, [r5 + 15 * 16]
20968 pmulhrsw m3, m7
20969 pmaddubsw m5, m2, [r5 + 15 * 16]
20970 pmulhrsw m5, m7
20971 packuswb m3, m5
20972 movu [r0 + 720 * 16], m3
20973 pmaddubsw m3, m1, [r5 + 15 * 16]
20974 pmulhrsw m3, m7
20975 pmaddubsw m5, m4, [r5 + 15 * 16]
20976 pmulhrsw m5, m7
20977 packuswb m3, m5
20978 movu [r0 + 721 * 16], m3
20979
20980 ; mode 13 [row 9]
20981 pmaddubsw m3, m6, [r5 + 6 * 16]
20982 pmulhrsw m3, m7
20983 pmaddubsw m5, m2, [r5 + 6 * 16]
20984 pmulhrsw m5, m7
20985 packuswb m3, m5
20986 movu [r0 + 722 * 16], m3
20987 pmaddubsw m3, m1, [r5 + 6 * 16]
20988 pmulhrsw m3, m7
20989 pmaddubsw m5, m4, [r5 + 6 * 16]
20990 pmulhrsw m5, m7
20991 packuswb m3, m5
20992 movu [r0 + 723 * 16], m3
20993
20994 ; mode 14 [row 4]
20995 pinsrb m6, [r3 + 2], 2
20996 pinsrb m6, [r3 + 2], 1
20997 pinsrb m6, [r3 + 5], 0
20998 pmaddubsw m3, m6, [r5 + 31 * 16]
20999 pmulhrsw m3, m7
21000 pmaddubsw m5, m2, [r5 + 31 * 16]
21001 pmulhrsw m5, m7
21002 packuswb m3, m5
21003 movu [r0 + 776 * 16], m3
21004 pmaddubsw m3, m1, [r5 + 31 * 16]
21005 pmulhrsw m3, m7
21006 pmaddubsw m5, m4, [r5 + 31 * 16]
21007 pmulhrsw m5, m7
21008 packuswb m3, m5
21009 movu [r0 + 777 * 16], m3
21010
21011 ; mode 14 [row 5]
21012 pmaddubsw m3, m6, [r5 + 18 * 16]
21013 pmulhrsw m3, m7
21014 pmaddubsw m5, m2, [r5 + 18 * 16]
21015 pmulhrsw m5, m7
21016 packuswb m3, m5
21017 movu [r0 + 778 * 16], m3
21018 pmaddubsw m3, m1, [r5 + 18 * 16]
21019 pmulhrsw m3, m7
21020 pmaddubsw m5, m4, [r5 + 18 * 16]
21021 pmulhrsw m5, m7
21022 packuswb m3, m5
21023 movu [r0 + 779 * 16], m3
21024
21025 ; mode 14 [row 6]
21026 pmaddubsw m3, m6, [r5 + 5 * 16]
21027 pmulhrsw m3, m7
21028 pmaddubsw m5, m2, [r5 + 5 * 16]
21029 pmulhrsw m5, m7
21030 packuswb m3, m5
21031 movu [r0 + 780 * 16], m3
21032 pmaddubsw m3, m1, [r5 + 5 * 16]
21033 pmulhrsw m3, m7
21034 pmaddubsw m5, m4, [r5 + 5 * 16]
21035 pmulhrsw m5, m7
21036 packuswb m3, m5
21037 movu [r0 + 781 * 16], m3
21038
21039 ; mode 14 [row 7]
21040 pslldq m6, 2
21041 pinsrb m6, [r3 + 5], 1
21042 pinsrb m6, [r3 + 7], 0
21043 pmaddubsw m3, m6, [r5 + 24 * 16]
21044 pmulhrsw m3, m7
21045 pslldq m2, 2
21046 pinsrw m2, [r4 + 5], 0
21047 pmaddubsw m5, m2, [r5 + 24 * 16]
21048 pmulhrsw m5, m7
21049 packuswb m3, m5
21050 movu [r0 + 782 * 16], m3
21051 pslldq m1, 2
21052 pinsrw m1, [r4 + 13], 0
21053 pmaddubsw m3, m1, [r5 + 24 * 16]
21054 pmulhrsw m3, m7
21055 pslldq m4, 2
21056 pinsrw m4, [r4 + 21], 0
21057 pmaddubsw m5, m4, [r5 + 24 * 16]
21058 pmulhrsw m5, m7
21059 packuswb m3, m5
21060 movu [r0 + 783 * 16], m3
21061
21062 ; mode 14 [row 8]
21063 pmaddubsw m3, m6, [r5 + 11 * 16]
21064 pmulhrsw m3, m7
21065 pmaddubsw m5, m2, [r5 + 11 * 16]
21066 pmulhrsw m5, m7
21067 packuswb m3, m5
21068 movu [r0 + 784 * 16], m3
21069 pmaddubsw m3, m1, [r5 + 11 * 16]
21070 pmulhrsw m3, m7
21071 pmaddubsw m5, m4, [r5 + 11 * 16]
21072 pmulhrsw m5, m7
21073 packuswb m3, m5
21074 movu [r0 + 785 * 16], m3
21075
21076 ; mode 15 [row 5, 8-31]
21077 pmaddubsw m5, m2, [r5 + 26 * 16]
21078 pmulhrsw m5, m7
21079 packuswb m5, m5
21080 movh [r0 + 842 * 16 + 8], m5
21081 pmaddubsw m3, m1, [r5 + 26 * 16]
21082 pmulhrsw m3, m7
21083 pmaddubsw m5, m4, [r5 + 26 * 16]
21084 pmulhrsw m5, m7
21085 packuswb m3, m5
21086 movu [r0 + 843 * 16], m3
21087
21088 ; mode 15 [row 6, 8-31]
21089 pmaddubsw m5, m2, [r5 + 9 * 16]
21090 pmulhrsw m5, m7
21091 packuswb m5, m5
21092 movh [r0 + 844 * 16 + 8], m5
21093 pmaddubsw m3, m1, [r5 + 9 * 16]
21094 pmulhrsw m3, m7
21095 pmaddubsw m5, m4, [r5 + 9 * 16]
21096 pmulhrsw m5, m7
21097 packuswb m3, m5
21098 movu [r0 + 845 * 16], m3
21099
21100 ; mode 12 [row 19]
21101 pslldq m0, 2
21102 pinsrb m0, [r3 + 13], 1
21103 pinsrb m0, [r3 + 19], 0
21104 pmaddubsw m3, m0, [r5 + 28 * 16]
21105 pmulhrsw m3, m7
21106 pmaddubsw m5, m2, [r5 + 28 * 16]
21107 pmulhrsw m5, m7
21108 packuswb m3, m5
21109 movu [r0 + 678 * 16], m3
21110 pmaddubsw m3, m1, [r5 + 28 * 16]
21111 pmulhrsw m3, m7
21112 pmaddubsw m5, m4, [r5 + 28 * 16]
21113 pmulhrsw m5, m7
21114 packuswb m3, m5
21115 movu [r0 + 679 * 16], m3
21116
21117 ; mode 12 [row 20]
21118 pmaddubsw m3, m0, [r5 + 23 * 16]
21119 pmulhrsw m3, m7
21120 pmaddubsw m5, m2, [r5 + 23 * 16]
21121 pmulhrsw m5, m7
21122 packuswb m3, m5
21123 movu [r0 + 680 * 16], m3
21124 pmaddubsw m3, m1, [r5 + 23 * 16]
21125 pmulhrsw m3, m7
21126 pmaddubsw m5, m4, [r5 + 23 * 16]
21127 pmulhrsw m5, m7
21128 packuswb m3, m5
21129 movu [r0 + 681 * 16], m3
21130
21131 ; mode 12 [row 21]
21132 pmaddubsw m3, m0, [r5 + 18 * 16]
21133 pmulhrsw m3, m7
21134 pmaddubsw m5, m2, [r5 + 18 * 16]
21135 pmulhrsw m5, m7
21136 packuswb m3, m5
21137 movu [r0 + 682 * 16], m3
21138 pmaddubsw m3, m1, [r5 + 18 * 16]
21139 pmulhrsw m3, m7
21140 pmaddubsw m5, m4, [r5 + 18 * 16]
21141 pmulhrsw m5, m7
21142 packuswb m3, m5
21143 movu [r0 + 683 * 16], m3
21144
21145 ; mode 12 [row 22]
21146 pmaddubsw m3, m0, [r5 + 13 * 16]
21147 pmulhrsw m3, m7
21148 pmaddubsw m5, m2, [r5 + 13 * 16]
21149 pmulhrsw m5, m7
21150 packuswb m3, m5
21151 movu [r0 + 684 * 16], m3
21152 pmaddubsw m3, m1, [r5 + 13 * 16]
21153 pmulhrsw m3, m7
21154 pmaddubsw m5, m4, [r5 + 13 * 16]
21155 pmulhrsw m5, m7
21156 packuswb m3, m5
21157 movu [r0 + 685 * 16], m3
21158
21159 ; mode 12 [row 23]
21160 pmaddubsw m3, m0, [r5 + 8 * 16]
21161 pmulhrsw m3, m7
21162 pmaddubsw m5, m2, [r5 + 8 * 16]
21163 pmulhrsw m5, m7
21164 packuswb m3, m5
21165 movu [r0 + 686 * 16], m3
21166 pmaddubsw m3, m1, [r5 + 8 * 16]
21167 pmulhrsw m3, m7
21168 pmaddubsw m5, m4, [r5 + 8 * 16]
21169 pmulhrsw m5, m7
21170 packuswb m3, m5
21171 movu [r0 + 687 * 16], m3
21172
21173 ; mode 12 [row 24]
21174 pmaddubsw m3, m0, [r5 + 3 * 16]
21175 pmulhrsw m3, m7
21176 pmaddubsw m5, m2, [r5 + 3 * 16]
21177 pmulhrsw m5, m7
21178 packuswb m3, m5
21179 movu [r0 + 688 * 16], m3
21180 pmaddubsw m3, m1, [r5 + 3 * 16]
21181 pmulhrsw m3, m7
21182 pmaddubsw m5, m4, [r5 + 3 * 16]
21183 pmulhrsw m5, m7
21184 packuswb m3, m5
21185 movu [r0 + 689 * 16], m3
21186
21187 ; mode 13 [row 10]
21188 movu m7, m6 ; m7 was the pmulhrsw multiplier up to here; it now keeps a copy of m6 (last set up for mode 14 rows 7-8) and is shifted/updated by the later mode 14 rows
21189 movu m6, m0 ; rebuild the mode 13 row from m0 (last set up for mode 12 rows 19-24), then patch bytes 4..0 below
21190 pinsrb m6, [r3 + 4], 4
21191 pinsrb m6, [r3 + 4], 3
21192 pinsrb m6, [r3 + 7], 2
21193 pinsrb m6, [r3 + 7], 1
21194 pinsrb m6, [r3 + 11], 0 ; bytes 4..0 of m6 = [r3+4], [r3+4], [r3+7], [r3+7], [r3+11]
21195 pmaddubsw m3, m6, [r5 + 29 * 16]
21196 pmulhrsw m3, [pw_1024] ; multiplier now read from memory because m7 is in use as data (presumably the same constant m7 held; its load is outside this chunk)
21197 pmaddubsw m5, m2, [r5 + 29 * 16]
21198 pmulhrsw m5, [pw_1024]
21199 packuswb m3, m5
21200 movu [r0 + 724 * 16], m3 ; first 16 output bytes of this row (from m6 and m2)
21201 pmaddubsw m3, m1, [r5 + 29 * 16]
21202 pmulhrsw m3, [pw_1024]
21203 pmaddubsw m5, m4, [r5 + 29 * 16]
21204 pmulhrsw m5, [pw_1024]
21205 packuswb m3, m5
21206 movu [r0 + 725 * 16], m3 ; second 16 output bytes of this row (from m1 and m4)
21207
21208 ; mode 13 [row 11]
21209 pmaddubsw m3, m6, [r5 + 20 * 16]
21210 pmulhrsw m3, [pw_1024]
21211 pmaddubsw m5, m2, [r5 + 20 * 16]
21212 pmulhrsw m5, [pw_1024]
21213 packuswb m3, m5
21214 movu [r0 + 726 * 16], m3
21215 pmaddubsw m3, m1, [r5 + 20 * 16]
21216 pmulhrsw m3, [pw_1024]
21217 pmaddubsw m5, m4, [r5 + 20 * 16]
21218 pmulhrsw m5, [pw_1024]
21219 packuswb m3, m5
21220 movu [r0 + 727 * 16], m3
21221
21222 ; mode 13 [row 12]
21223 pmaddubsw m3, m6, [r5 + 11 * 16]
21224 pmulhrsw m3, [pw_1024]
21225 pmaddubsw m5, m2, [r5 + 11 * 16]
21226 pmulhrsw m5, [pw_1024]
21227 packuswb m3, m5
21228 movu [r0 + 728 * 16], m3
21229 pmaddubsw m3, m1, [r5 + 11 * 16]
21230 pmulhrsw m3, [pw_1024]
21231 pmaddubsw m5, m4, [r5 + 11 * 16]
21232 pmulhrsw m5, [pw_1024]
21233 packuswb m3, m5
21234 movu [r0 + 729 * 16], m3
21235
21236 ; mode 13 [row 13]
21237 pmaddubsw m3, m6, [r5 + 2 * 16]
21238 pmulhrsw m3, [pw_1024]
21239 pmaddubsw m5, m2, [r5 + 2 * 16]
21240 pmulhrsw m5, [pw_1024]
21241 packuswb m3, m5
21242 movu [r0 + 730 * 16], m3
21243 pmaddubsw m3, m1, [r5 + 2 * 16]
21244 pmulhrsw m3, [pw_1024]
21245 pmaddubsw m5, m4, [r5 + 2 * 16]
21246 pmulhrsw m5, [pw_1024]
21247 packuswb m3, m5
21248 movu [r0 + 731 * 16], m3
21249
21250 ; mode 14 [row 9]
21251 pslldq m7, 2
21252 pinsrb m7, [r3 + 7], 1
21253 pinsrb m7, [r3 + 10], 0
21254 pmaddubsw m3, m7, [r5 + 30 * 16]
21255 pmulhrsw m3, [pw_1024]
21256 pslldq m2, 2
21257 pinsrw m2, [r4 + 4], 0
21258 pmaddubsw m5, m2, [r5 + 30 * 16]
21259 pmulhrsw m5, [pw_1024]
21260 packuswb m3, m5
21261 movu [r0 + 786 * 16], m3
21262 pslldq m1, 2
21263 pinsrw m1, [r4 + 12], 0
21264 pmaddubsw m3, m1, [r5 + 30 * 16]
21265 pmulhrsw m3, [pw_1024]
21266 pslldq m4, 2
21267 pinsrb m4, [r4 + 21], 1
21268 pinsrb m4, [r4 + 20], 0
21269 pmaddubsw m5, m4, [r5 + 30 * 16]
21270 pmulhrsw m5, [pw_1024]
21271 packuswb m3, m5
21272 movu [r0 + 787 * 16], m3
21273
21274 ; mode 14 [row 10]
21275 pmaddubsw m3, m7, [r5 + 17 * 16]
21276 pmulhrsw m3, [pw_1024]
21277 pmaddubsw m5, m2, [r5 + 17 * 16]
21278 pmulhrsw m5, [pw_1024]
21279 packuswb m3, m5
21280 movu [r0 + 788 * 16], m3
21281 pmaddubsw m3, m1, [r5 + 17 * 16]
21282 pmulhrsw m3, [pw_1024]
21283 pmaddubsw m5, m4, [r5 + 17 * 16]
21284 pmulhrsw m5, [pw_1024]
21285 packuswb m3, m5
21286 movu [r0 + 789 * 16], m3
21287
21288 ; mode 14 [row 11]
21289 pmaddubsw m3, m7, [r5 + 4 * 16]
21290 pmulhrsw m3, [pw_1024]
21291 pmaddubsw m5, m2, [r5 + 4 * 16]
21292 pmulhrsw m5, [pw_1024]
21293 packuswb m3, m5
21294 movu [r0 + 790 * 16], m3
21295 pmaddubsw m3, m1, [r5 + 4 * 16]
21296 pmulhrsw m3, [pw_1024]
21297 pmaddubsw m5, m4, [r5 + 4 * 16]
21298 pmulhrsw m5, [pw_1024]
21299 packuswb m3, m5
21300 movu [r0 + 791 * 16], m3
21301
21302 movu m6, [pw_1024] ; use m6 as the pmulhrsw multiplier for the mode 15 rows 7 and 8 (8-31) steps that follow; this discards the mode 13 data m6 held, which "mode 13 [row 14]" later rebuilds from m0
21303
21304 ; mode 15 [row 7, 8-31]
21305 pmaddubsw m5, m2, [r5 + 24 * 16]
21306 pmulhrsw m5, m6
21307 packuswb m5, m5 ; both qwords of m5 now hold the same 8 result bytes
21308 movh [r0 + 846 * 16 + 8], m5 ; bytes 8-15 only; bytes 0-7 were stored by the "[row 7, 0-7]" step
21309 pmaddubsw m3, m1, [r5 + 24 * 16]
21310 pmulhrsw m3, m6
21311 pmaddubsw m5, m4, [r5 + 24 * 16]
21312 pmulhrsw m5, m6
21313 packuswb m3, m5
21314 movu [r0 + 847 * 16], m3 ; bytes 16-31
21315
21316 ; mode 15 [row 8, 8-31]
21317 pmaddubsw m5, m2, [r5 + 7 * 16]
21318 pmulhrsw m5, m6
21319 packuswb m5, m5
21320 movh [r0 + 848 * 16 + 8], m5
21321 pmaddubsw m3, m1, [r5 + 7 * 16]
21322 pmulhrsw m3, m6
21323 pmaddubsw m5, m4, [r5 + 7 * 16]
21324 pmulhrsw m5, m6
21325 packuswb m3, m5
21326 movu [r0 + 849 * 16], m3
21327
21328 ; mode 12 [row 25]
21329 pslldq m0, 2
21330 pinsrb m0, [r3 + 19], 1
21331 pinsrb m0, [r3 + 26], 0
21332 pmaddubsw m3, m0, [r5 + 30 * 16]
21333 pmulhrsw m3, [pw_1024]
21334 pmaddubsw m5, m2, [r5 + 30 * 16]
21335 pmulhrsw m5, [pw_1024]
21336 packuswb m3, m5
21337 movu [r0 + 690 * 16], m3
21338 pmaddubsw m3, m1, [r5 + 30 * 16]
21339 pmulhrsw m3, [pw_1024]
21340 pmaddubsw m5, m4, [r5 + 30 * 16]
21341 pmulhrsw m5, [pw_1024]
21342 packuswb m3, m5
21343 movu [r0 + 691 * 16], m3
21344
21345 ; mode 12 [row 26]
21346 pmaddubsw m3, m0, [r5 + 25 * 16]
21347 pmulhrsw m3, [pw_1024]
21348 pmaddubsw m5, m2, [r5 + 25 * 16]
21349 pmulhrsw m5, [pw_1024]
21350 packuswb m3, m5
21351 movu [r0 + 692 * 16], m3
21352 pmaddubsw m3, m1, [r5 + 25 * 16]
21353 pmulhrsw m3, [pw_1024]
21354 pmaddubsw m5, m4, [r5 + 25 * 16]
21355 pmulhrsw m5, [pw_1024]
21356 packuswb m3, m5
21357 movu [r0 + 693 * 16], m3
21358
21359 ; mode 12 [row 27]
21360 pmaddubsw m3, m0, [r5 + 20 * 16]
21361 pmulhrsw m3, [pw_1024]
21362 pmaddubsw m5, m2, [r5 + 20 * 16]
21363 pmulhrsw m5, [pw_1024]
21364 packuswb m3, m5
21365 movu [r0 + 694 * 16], m3
21366 pmaddubsw m3, m1, [r5 + 20 * 16]
21367 pmulhrsw m3, [pw_1024]
21368 pmaddubsw m5, m4, [r5 + 20 * 16]
21369 pmulhrsw m5, [pw_1024]
21370 packuswb m3, m5
21371 movu [r0 + 695 * 16], m3
21372
21373 ; mode 12 [row 28]
21374 pmaddubsw m3, m0, [r5 + 15 * 16]
21375 pmulhrsw m3, [pw_1024]
21376 pmaddubsw m5, m2, [r5 + 15 * 16]
21377 pmulhrsw m5, [pw_1024]
21378 packuswb m3, m5
21379 movu [r0 + 696 * 16], m3
21380 pmaddubsw m3, m1, [r5 + 15 * 16]
21381 pmulhrsw m3, [pw_1024]
21382 pmaddubsw m5, m4, [r5 + 15 * 16]
21383 pmulhrsw m5, [pw_1024]
21384 packuswb m3, m5
21385 movu [r0 + 697 * 16], m3
21386
21387 ; mode 12 [row 29]
21388 pmaddubsw m3, m0, [r5 + 10 * 16]
21389 pmulhrsw m3, [pw_1024]
21390 pmaddubsw m5, m2, [r5 + 10 * 16]
21391 pmulhrsw m5, [pw_1024]
21392 packuswb m3, m5
21393 movu [r0 + 698 * 16], m3
21394 pmaddubsw m3, m1, [r5 + 10 * 16]
21395 pmulhrsw m3, [pw_1024]
21396 pmaddubsw m5, m4, [r5 + 10 * 16]
21397 pmulhrsw m5, [pw_1024]
21398 packuswb m3, m5
21399 movu [r0 + 699 * 16], m3
21400
21401 ; mode 12 [row 30]
21402 pmaddubsw m3, m0, [r5 + 5 * 16]
21403 pmulhrsw m3, [pw_1024]
21404 pmaddubsw m5, m2, [r5 + 5 * 16]
21405 pmulhrsw m5, [pw_1024]
21406 packuswb m3, m5
21407 movu [r0 + 700 * 16], m3
21408 pmaddubsw m3, m1, [r5 + 5 * 16]
21409 pmulhrsw m3, [pw_1024]
21410 pmaddubsw m5, m4, [r5 + 5 * 16]
21411 pmulhrsw m5, [pw_1024]
21412 packuswb m3, m5
21413 movu [r0 + 701 * 16], m3
21414
21415 ; mode 13 [row 14]
21416 movu m6, m0
21417 pinsrb m6, [r3 + 4], 6
21418 pinsrb m6, [r3 + 4], 5
21419 pinsrb m6, [r3 + 7], 4
21420 pinsrb m6, [r3 + 7], 3
21421 pinsrb m6, [r3 + 11], 2
21422 pinsrb m6, [r3 + 11], 1
21423 pinsrb m6, [r3 + 14], 0
21424 pmaddubsw m3, m6, [r5 + 25 * 16]
21425 pmulhrsw m3, [pw_1024]
21426 pmaddubsw m5, m2, [r5 + 25 * 16]
21427 pmulhrsw m5, [pw_1024]
21428 packuswb m3, m5
21429 movu [r0 + 732 * 16], m3
21430 pmaddubsw m3, m1, [r5 + 25 * 16]
21431 pmulhrsw m3, [pw_1024]
21432 pmaddubsw m5, m4, [r5 + 25 * 16]
21433 pmulhrsw m5, [pw_1024]
21434 packuswb m3, m5
21435 movu [r0 + 733 * 16], m3
21436
21437 ; mode 13 [row 15]
21438 pmaddubsw m3, m6, [r5 + 16 * 16]
21439 pmulhrsw m3, [pw_1024]
21440 pmaddubsw m5, m2, [r5 + 16 * 16]
21441 pmulhrsw m5, [pw_1024]
21442 packuswb m3, m5
21443 movu [r0 + 734 * 16], m3
21444 pmaddubsw m3, m1, [r5 + 16 * 16]
21445 pmulhrsw m3, [pw_1024]
21446 pmaddubsw m5, m4, [r5 + 16 * 16]
21447 pmulhrsw m5, [pw_1024]
21448 packuswb m3, m5
21449 movu [r0 + 735 * 16], m3
21450
21451 ; mode 13 [row 16]
21452 pmaddubsw m3, m6, [r5 + 7 * 16]
21453 pmulhrsw m3, [pw_1024]
21454 pmaddubsw m5, m2, [r5 + 7 * 16]
21455 pmulhrsw m5, [pw_1024]
21456 packuswb m3, m5
21457 movu [r0 + 736 * 16], m3
21458 pmaddubsw m3, m1, [r5 + 7 * 16]
21459 pmulhrsw m3, [pw_1024]
21460 pmaddubsw m5, m4, [r5 + 7 * 16]
21461 pmulhrsw m5, [pw_1024]
21462 packuswb m3, m5
21463 movu [r0 + 737 * 16], m3
21464
21465 ; mode 13 [row 17]
21466 pslldq m6, 2
21467 pinsrb m6, [r3 + 14], 1
21468 pinsrb m6, [r3 + 18], 0
21469 pmaddubsw m3, m6, [r5 + 30 * 16]
21470 pmulhrsw m3, [pw_1024]
21471 pslldq m2, 2
21472 pinsrw m2, [r4 + 3], 0
21473 pmaddubsw m5, m2, [r5 + 30 * 16]
21474 pmulhrsw m5, [pw_1024]
21475 packuswb m3, m5
21476 movu [r0 + 738 * 16], m3
21477 pslldq m1, 2
21478 pinsrw m1, [r4 + 11], 0
21479 pmaddubsw m3, m1, [r5 + 30 * 16]
21480 pmulhrsw m3, [pw_1024]
21481 pslldq m4, 2
21482 pinsrw m4, [r4 + 19], 0
21483 pmaddubsw m5, m4, [r5 + 30 * 16]
21484 pmulhrsw m5, [pw_1024]
21485 packuswb m3, m5
21486 movu [r0 + 739 * 16], m3
21487
21488 ; mode 13 [row 18]
21489 pmaddubsw m3, m6, [r5 + 21 * 16]
21490 pmulhrsw m3, [pw_1024]
21491 pmaddubsw m5, m2, [r5 + 21 * 16]
21492 pmulhrsw m5, [pw_1024]
21493 packuswb m3, m5
21494 movu [r0 + 740 * 16], m3
21495 pmaddubsw m3, m1, [r5 + 21 * 16]
21496 pmulhrsw m3, [pw_1024]
21497 pmaddubsw m5, m4, [r5 + 21 * 16]
21498 pmulhrsw m5, [pw_1024]
21499 packuswb m3, m5
21500 movu [r0 + 741 * 16], m3
21501
21502 ; mode 13 [row 19]
21503 pmaddubsw m3, m6, [r5 + 12 * 16]
21504 pmulhrsw m3, [pw_1024]
21505 pmaddubsw m5, m2, [r5 + 12 * 16]
21506 pmulhrsw m5, [pw_1024]
21507 packuswb m3, m5
21508 movu [r0 + 742 * 16], m3
21509 pmaddubsw m3, m1, [r5 + 12 * 16]
21510 pmulhrsw m3, [pw_1024]
21511 pmaddubsw m5, m4, [r5 + 12 * 16]
21512 pmulhrsw m5, [pw_1024]
21513 packuswb m3, m5
21514 movu [r0 + 743 * 16], m3
21515
21516 ; mode 13 [row 20]
21517 pmaddubsw m3, m6, [r5 + 3 * 16]
21518 pmulhrsw m3, [pw_1024]
21519 pmaddubsw m5, m2, [r5 + 3 * 16]
21520 pmulhrsw m5, [pw_1024]
21521 packuswb m3, m5
21522 movu [r0 + 744 * 16], m3
21523 pmaddubsw m3, m1, [r5 + 3 * 16]
21524 pmulhrsw m3, [pw_1024]
21525 pmaddubsw m5, m4, [r5 + 3 * 16]
21526 pmulhrsw m5, [pw_1024]
21527 packuswb m3, m5
21528 movu [r0 + 745 * 16], m3
21529
21530 ; mode 14 [row 12]
21531 pslldq m7, 2
21532 pinsrb m7, [r3 + 10], 1
21533 pinsrb m7, [r3 + 12], 0
21534 pmaddubsw m3, m7, [r5 + 23 * 16]
21535 pmulhrsw m3, [pw_1024]
21536 pmaddubsw m5, m2, [r5 + 23 * 16]
21537 pmulhrsw m5, [pw_1024]
21538 packuswb m3, m5
21539 movu [r0 + 792 * 16], m3
21540 pmaddubsw m3, m1, [r5 + 23 * 16]
21541 pmulhrsw m3, [pw_1024]
21542 pmaddubsw m5, m4, [r5 + 23 * 16]
21543 pmulhrsw m5, [pw_1024]
21544 packuswb m3, m5
21545 movu [r0 + 793 * 16], m3
21546
21547 ; mode 14 [row 13]
21548 pmaddubsw m3, m7, [r5 + 10 * 16]
21549 pmulhrsw m3, [pw_1024]
21550 pmaddubsw m5, m2, [r5 + 10 * 16]
21551 pmulhrsw m5, [pw_1024]
21552 packuswb m3, m5
21553 movu [r0 + 794 * 16], m3
21554 pmaddubsw m3, m1, [r5 + 10 * 16]
21555 pmulhrsw m3, [pw_1024]
21556 pmaddubsw m5, m4, [r5 + 10 * 16]
21557 pmulhrsw m5, [pw_1024]
21558 packuswb m3, m5
21559 movu [r0 + 795 * 16], m3
21560
21561 ; mode 15 [row 9, 8-31] (bytes 0-7 of this row were stored by the "[row 9, 0-7]" step)
21562 pmaddubsw m5, m2, [r5 + 22 * 16]
21563 pmulhrsw m5, [pw_1024]
21564 packuswb m5, m5 ; both qwords of m5 now hold the same 8 result bytes
21565 movh [r0 + 850 * 16 + 8], m5 ; bytes 8-15: store only the low qword, as the rows 5-8 steps do, instead of spilling 8 duplicate bytes into the next 16-byte slot
21566 pmaddubsw m3, m1, [r5 + 22 * 16]
21567 pmulhrsw m3, [pw_1024]
21568 pmaddubsw m5, m4, [r5 + 22 * 16]
21569 pmulhrsw m5, [pw_1024]
21570 packuswb m3, m5
21571 movu [r0 + 851 * 16], m3 ; bytes 16-31
21572
21573 ; mode 15 [row 10, 8-31] (bytes 0-7 of this row were stored by the "[row 10, 0-7]" step)
21574 pmaddubsw m5, m2, [r5 + 5 * 16]
21575 pmulhrsw m5, [pw_1024]
21576 packuswb m5, m5 ; both qwords of m5 now hold the same 8 result bytes
21577 movh [r0 + 852 * 16 + 8], m5 ; bytes 8-15: store only the low qword, as the rows 5-8 steps do, instead of spilling 8 duplicate bytes into the next 16-byte slot
21578 pmaddubsw m3, m1, [r5 + 5 * 16]
21579 pmulhrsw m3, [pw_1024]
21580 pmaddubsw m5, m4, [r5 + 5 * 16]
21581 pmulhrsw m5, [pw_1024]
21582 packuswb m3, m5
21583 movu [r0 + 853 * 16], m3 ; bytes 16-31
21584
21585 ; mode 13 [row 21]
21586 pslldq m6, 2
21587 pinsrb m6, [r3 + 18], 1
21588 pinsrb m6, [r3 + 21], 0
21589 pmaddubsw m3, m6, [r5 + 26 * 16]
21590 pmulhrsw m3, [pw_1024]
21591 pslldq m2, 2
21592 pinsrw m2, [r4 + 2], 0
21593 pmaddubsw m5, m2, [r5 + 26 * 16]
21594 pmulhrsw m5, [pw_1024]
21595 packuswb m3, m5
21596 movu [r0 + 746 * 16], m3
21597 pslldq m1, 2
21598 pinsrw m1, [r4 + 10], 0
21599 pmaddubsw m3, m1, [r5 + 26 * 16]
21600 pmulhrsw m3, [pw_1024]
21601 pslldq m4, 2
21602 pinsrw m4, [r4 + 18], 0
21603 pmaddubsw m5, m4, [r5 + 26 * 16]
21604 pmulhrsw m5, [pw_1024]
21605 packuswb m3, m5
21606 movu [r0 + 747 * 16], m3
21607
21608 ; mode 13 [row 22]
21609 pmaddubsw m3, m6, [r5 + 17 * 16]
21610 pmulhrsw m3, [pw_1024]
21611 pmaddubsw m5, m2, [r5 + 17 * 16]
21612 pmulhrsw m5, [pw_1024]
21613 packuswb m3, m5
21614 movu [r0 + 748 * 16], m3
21615 pmaddubsw m3, m1, [r5 + 17 * 16]
21616 pmulhrsw m3, [pw_1024]
21617 pmaddubsw m5, m4, [r5 + 17 * 16]
21618 pmulhrsw m5, [pw_1024]
21619 packuswb m3, m5
21620 movu [r0 + 749 * 16], m3
21621
21622 ; mode 13 [row 23]
21623 pmaddubsw m3, m6, [r5 + 8 * 16]
21624 pmulhrsw m3, [pw_1024]
21625 pmaddubsw m5, m2, [r5 + 8 * 16]
21626 pmulhrsw m5, [pw_1024]
21627 packuswb m3, m5
21628 movu [r0 + 750 * 16], m3
21629 pmaddubsw m3, m1, [r5 + 8 * 16]
21630 pmulhrsw m3, [pw_1024]
21631 pmaddubsw m5, m4, [r5 + 8 * 16]
21632 pmulhrsw m5, [pw_1024]
21633 packuswb m3, m5
21634 movu [r0 + 751 * 16], m3
21635
21636 ; mode 14 [row 14]
21637 pslldq m7, 2
21638 pinsrb m7, [r3 + 12], 1
21639 pinsrb m7, [r3 + 15], 0
21640 pmaddubsw m3, m7, [r5 + 29 * 16]
21641 pmulhrsw m3, [pw_1024]
21642 pmaddubsw m5, m2, [r5 + 29 * 16]
21643 pmulhrsw m5, [pw_1024]
21644 packuswb m3, m5
21645 movu [r0 + 796 * 16], m3
21646 pmaddubsw m3, m1, [r5 + 29 * 16]
21647 pmulhrsw m3, [pw_1024]
21648 pmaddubsw m5, m4, [r5 + 29 * 16]
21649 pmulhrsw m5, [pw_1024]
21650 packuswb m3, m5
21651 movu [r0 + 797 * 16], m3
21652
21653 ; mode 14 [row 15]
21654 pmaddubsw m3, m7, [r5 + 16 * 16]
21655 pmulhrsw m3, [pw_1024]
21656 pmaddubsw m5, m2, [r5 + 16 * 16]
21657 pmulhrsw m5, [pw_1024]
21658 packuswb m3, m5
21659 movu [r0 + 798 * 16], m3
21660 pmaddubsw m3, m1, [r5 + 16 * 16]
21661 pmulhrsw m3, [pw_1024]
21662 pmaddubsw m5, m4, [r5 + 16 * 16]
21663 pmulhrsw m5, [pw_1024]
21664 packuswb m3, m5
21665 movu [r0 + 799 * 16], m3
21666
21667 ; mode 14 [row 16]
21668 pmaddubsw m3, m7, [r5 + 3 * 16]
21669 pmulhrsw m3, [pw_1024]
21670 pmaddubsw m5, m2, [r5 + 3 * 16]
21671 pmulhrsw m5, [pw_1024]
21672 packuswb m3, m5
21673 movu [r0 + 800 * 16], m3
21674 pmaddubsw m3, m1, [r5 + 3 * 16]
21675 pmulhrsw m3, [pw_1024]
21676 pmaddubsw m5, m4, [r5 + 3 * 16]
21677 pmulhrsw m5, [pw_1024]
21678 packuswb m3, m5
21679 movu [r0 + 801 * 16], m3
21680
21681 ; mode 15 [row 11]
21682 pmaddubsw m5, m2, [r5 + 20 * 16]
21683 pmulhrsw m5, [pw_1024]
21684 packuswb m5, m5
21685 movh [r0 + 854 * 16 + 8], m5
21686 pmaddubsw m3, m1, [r5 + 20 * 16]
21687 pmulhrsw m3, [pw_1024]
21688 pmaddubsw m5, m4, [r5 + 20 * 16]
21689 pmulhrsw m5, [pw_1024]
21690 packuswb m3, m5
21691 movu [r0 + 855 * 16], m3
21692
21693 ; mode 15 [row 12]
21694 pmaddubsw m5, m2, [r5 + 3 * 16]
21695 pmulhrsw m5, [pw_1024]
21696 packuswb m5, m5
21697 movh [r0 + 856 * 16 + 8], m5
21698 pmaddubsw m3, m1, [r5 + 3 * 16]
21699 pmulhrsw m3, [pw_1024]
21700 pmaddubsw m5, m4, [r5 + 3 * 16]
21701 pmulhrsw m5, [pw_1024]
21702 packuswb m3, m5
21703 movu [r0 + 857 * 16], m3
21704
21705 ; mode 13 [row 24]
21706 pslldq m6, 2
21707 pinsrb m6, [r3 + 21], 1
21708 pinsrb m6, [r3 + 25], 0
21709 pmaddubsw m3, m6, [r5 + 31 * 16]
21710 pmulhrsw m3, [pw_1024]
21711 pslldq m2, 2
21712 pinsrw m2, [r4 + 1], 0
21713 pmaddubsw m5, m2, [r5 + 31 * 16]
21714 pmulhrsw m5, [pw_1024]
21715 packuswb m3, m5
21716 movu [r0 + 752 * 16], m3
21717 pslldq m1, 2
21718 pinsrw m1, [r4 + 9], 0
21719 pmaddubsw m3, m1, [r5 + 31 * 16]
21720 pmulhrsw m3, [pw_1024]
21721 pslldq m4, 2
21722 pinsrw m4, [r4 + 17], 0
21723 pmaddubsw m5, m4, [r5 + 31 * 16]
21724 pmulhrsw m5, [pw_1024]
21725 packuswb m3, m5
21726 movu [r0 + 753 * 16], m3
21727
21728 ; mode 13 [row 25]
21729 pmaddubsw m3, m6, [r5 + 22 * 16]
21730 pmulhrsw m3, [pw_1024]
21731 pmaddubsw m5, m2, [r5 + 22 * 16]
21732 pmulhrsw m5, [pw_1024]
21733 packuswb m3, m5
21734 movu [r0 + 754 * 16], m3
21735 pmaddubsw m3, m1, [r5 + 22 * 16]
21736 pmulhrsw m3, [pw_1024]
21737 pmaddubsw m5, m4, [r5 + 22 * 16]
21738 pmulhrsw m5, [pw_1024]
21739 packuswb m3, m5
21740 movu [r0 + 755 * 16], m3
21741
21742 ; mode 13 [row 26]
21743 pmaddubsw m3, m6, [r5 + 13 * 16]
21744 pmulhrsw m3, [pw_1024]
21745 pmaddubsw m5, m2, [r5 + 13 * 16]
21746 pmulhrsw m5, [pw_1024]
21747 packuswb m3, m5
21748 movu [r0 + 756 * 16], m3
21749 pmaddubsw m3, m1, [r5 + 13 * 16]
21750 pmulhrsw m3, [pw_1024]
21751 pmaddubsw m5, m4, [r5 + 13 * 16]
21752 pmulhrsw m5, [pw_1024]
21753 packuswb m3, m5
21754 movu [r0 + 757 * 16], m3
21755
21756 ; mode 13 [row 27]
21757 pmaddubsw m3, m6, [r5 + 4 * 16]
21758 pmulhrsw m3, [pw_1024]
21759 pmaddubsw m5, m2, [r5 + 4 * 16]
21760 pmulhrsw m5, [pw_1024]
21761 packuswb m3, m5
21762 movu [r0 + 758 * 16], m3
21763 pmaddubsw m3, m1, [r5 + 4 * 16]
21764 pmulhrsw m3, [pw_1024]
21765 pmaddubsw m5, m4, [r5 + 4 * 16]
21766 pmulhrsw m5, [pw_1024]
21767 packuswb m3, m5
21768 movu [r0 + 759 * 16], m3
21769
21770 ; mode 14 [row 17]
21771 pslldq m7, 2
21772 pinsrb m7, [r3 + 15], 1
21773 pinsrb m7, [r3 + 17], 0
21774 pmaddubsw m3, m7, [r5 + 22 * 16]
21775 pmulhrsw m3, [pw_1024]
21776 pmaddubsw m5, m2, [r5 + 22 * 16]
21777 pmulhrsw m5, [pw_1024]
21778 packuswb m3, m5
21779 movu [r0 + 802 * 16], m3
21780 pmaddubsw m3, m1, [r5 + 22 * 16]
21781 pmulhrsw m3, [pw_1024]
21782 pmaddubsw m5, m4, [r5 + 22 * 16]
21783 pmulhrsw m5, [pw_1024]
21784 packuswb m3, m5
21785 movu [r0 + 803 * 16], m3
21786
21787 ; mode 14 [row 18]
21788 pmaddubsw m3, m7, [r5 + 9 * 16]
21789 pmulhrsw m3, [pw_1024]
21790 pmaddubsw m5, m2, [r5 + 9 * 16]
21791 pmulhrsw m5, [pw_1024]
21792 packuswb m3, m5
21793 movu [r0 + 804 * 16], m3
21794 pmaddubsw m3, m1, [r5 + 9 * 16]
21795 pmulhrsw m3, [pw_1024]
21796 pmaddubsw m5, m4, [r5 + 9 * 16]
21797 pmulhrsw m5, [pw_1024]
21798 packuswb m3, m5
21799 movu [r0 + 805 * 16], m3
21800
21801 ; mode 15 [row 13]
21802 pmaddubsw m5, m2, [r5 + 18 * 16]
21803 pmulhrsw m5, [pw_1024]
21804 packuswb m5, m5
21805 movh [r0 + 858 * 16 + 8], m5
21806 pmaddubsw m3, m1, [r5 + 18 * 16]
21807 pmulhrsw m3, [pw_1024]
21808 pmaddubsw m5, m4, [r5 + 18 * 16]
21809 pmulhrsw m5, [pw_1024]
21810 packuswb m3, m5
21811 movu [r0 + 859 * 16], m3
21812
21813 ; mode 15 [row 14]
21814 pmaddubsw m5, m2, [r5 + 1 * 16]
21815 pmulhrsw m5, [pw_1024]
21816 packuswb m5, m5
21817 movh [r0 + 860 * 16 + 8], m5
21818 pmaddubsw m3, m1, [r5 + 1 * 16]
21819 pmulhrsw m3, [pw_1024]
21820 pmaddubsw m5, m4, [r5 + 1 * 16]
21821 pmulhrsw m5, [pw_1024]
21822 packuswb m3, m5
21823 movu [r0 + 861 * 16], m3
21824
21825 ; mode 13 [row 28]
21826 pslldq m6, 2
21827 pinsrb m6, [r3 + 25], 1
21828 pinsrb m6, [r3 + 28], 0
21829 pmaddubsw m3, m6, [r5 + 27 * 16]
21830 pmulhrsw m3, [pw_1024]
21831 pslldq m2, 2
21832 pinsrw m2, [r4 + 0], 0
21833 pmaddubsw m5, m2, [r5 + 27 * 16]
21834 pmulhrsw m5, [pw_1024]
21835 packuswb m3, m5
21836 movu [r0 + 760 * 16], m3
21837 pslldq m1, 2
21838 pinsrw m1, [r4 + 8], 0
21839 pmaddubsw m3, m1, [r5 + 27 * 16]
21840 pmulhrsw m3, [pw_1024]
21841 pslldq m4, 2
21842 pinsrw m4, [r4 + 16], 0
21843 pmaddubsw m5, m4, [r5 + 27 * 16]
21844 pmulhrsw m5, [pw_1024]
21845 packuswb m3, m5
21846 movu [r0 + 761 * 16], m3
21847
21848 ; mode 13 [row 29]
21849 pmaddubsw m3, m6, [r5 + 18 * 16]
21850 pmulhrsw m3, [pw_1024]
21851 pmaddubsw m5, m2, [r5 + 18 * 16]
21852 pmulhrsw m5, [pw_1024]
21853 packuswb m3, m5
21854 movu [r0 + 762 * 16], m3
21855 pmaddubsw m3, m1, [r5 + 18 * 16]
21856 pmulhrsw m3, [pw_1024]
21857 pmaddubsw m5, m4, [r5 + 18 * 16]
21858 pmulhrsw m5, [pw_1024]
21859 packuswb m3, m5
21860 movu [r0 + 763 * 16], m3
21861
21862 ; mode 13 [row 30]
21863 pmaddubsw m3, m6, [r5 + 9 * 16]
21864 pmulhrsw m3, [pw_1024]
21865 pmaddubsw m5, m2, [r5 + 9 * 16]
21866 pmulhrsw m5, [pw_1024]
21867 packuswb m3, m5
21868 movu [r0 + 764 * 16], m3
21869 pmaddubsw m3, m1, [r5 + 9 * 16]
21870 pmulhrsw m3, [pw_1024]
21871 pmaddubsw m5, m4, [r5 + 9 * 16]
21872 pmulhrsw m5, [pw_1024]
21873 packuswb m3, m5
21874 movu [r0 + 765 * 16], m3
21875
21876 ; mode 14 [row 19]
21877 pslldq m7, 2
21878 pinsrb m7, [r3 + 17], 1
21879 pinsrb m7, [r3 + 20], 0
21880 pmaddubsw m3, m7, [r5 + 28 * 16]
21881 pmulhrsw m3, [pw_1024]
21882 pmaddubsw m5, m2, [r5 + 28 * 16]
21883 pmulhrsw m5, [pw_1024]
21884 packuswb m3, m5
21885 movu [r0 + 806 * 16], m3
21886 pmaddubsw m3, m1, [r5 + 28 * 16]
21887 pmulhrsw m3, [pw_1024]
21888 pmaddubsw m5, m4, [r5 + 28 * 16]
21889 pmulhrsw m5, [pw_1024]
21890 packuswb m3, m5
21891 movu [r0 + 807 * 16], m3
21892
21893 ; mode 14 [row 20]
21894 pmaddubsw m3, m7, [r5 + 15 * 16]
21895 pmulhrsw m3, [pw_1024]
21896 pmaddubsw m5, m2, [r5 + 15 * 16]
21897 pmulhrsw m5, [pw_1024]
21898 packuswb m3, m5
21899 movu [r0 + 808 * 16], m3
21900 pmaddubsw m3, m1, [r5 + 15 * 16]
21901 pmulhrsw m3, [pw_1024]
21902 pmaddubsw m5, m4, [r5 + 15 * 16]
21903 pmulhrsw m5, [pw_1024]
21904 packuswb m3, m5
21905 movu [r0 + 809 * 16], m3
21906
21907 ; mode 14 [row 21]
21908 pmaddubsw m3, m7, [r5 + 2 * 16]
21909 pmulhrsw m3, [pw_1024]
21910 pmaddubsw m5, m2, [r5 + 2 * 16]
21911 pmulhrsw m5, [pw_1024]
21912 packuswb m3, m5
21913 movu [r0 + 810 * 16], m3
21914 pmaddubsw m3, m1, [r5 + 2 * 16]
21915 pmulhrsw m3, [pw_1024]
21916 pmaddubsw m5, m4, [r5 + 2 * 16]
21917 pmulhrsw m5, [pw_1024]
21918 packuswb m3, m5
21919 movu [r0 + 811 * 16], m3
21920
21921 ; mode 15 [row 15]
21922 pmaddubsw m5, m2, [r5 + 16 * 16]
21923 pmulhrsw m5, [pw_1024]
21924 packuswb m5, m5
21925 movh [r0 + 862 * 16 + 8], m5
21926 pmaddubsw m3, m1, [r5 + 16 * 16]
21927 pmulhrsw m3, [pw_1024]
21928 pmaddubsw m5, m4, [r5 + 16 * 16]
21929 pmulhrsw m5, [pw_1024]
21930 packuswb m3, m5
21931 movu [r0 + 863 * 16], m3
21932
21933 ; mode 14 [row 22]
21934 pslldq m7, 2
21935 pinsrb m7, [r3 + 20], 1
21936 pinsrb m7, [r3 + 22], 0
21937 pmaddubsw m3, m7, [r5 + 21 * 16]
21938 pmulhrsw m3, [pw_1024]
21939 pslldq m2, 2
21940 pinsrb m2, [r4 + 0], 1
21941 pinsrb m2, [r3 + 2], 0
21942 pmaddubsw m5, m2, [r5 + 21 * 16]
21943 pmulhrsw m5, [pw_1024]
21944 packuswb m3, m5
21945 movu [r0 + 812 * 16], m3
21946 pslldq m1, 2
21947 pinsrw m1, [r4 + 7], 0
21948 pmaddubsw m3, m1, [r5 + 21 * 16]
21949 pmulhrsw m3, [pw_1024]
21950 pslldq m4, 2
21951 pinsrw m4, [r4 + 15], 0
21952 pmaddubsw m5, m4, [r5 + 21 * 16]
21953 pmulhrsw m5, [pw_1024]
21954 packuswb m3, m5
21955 movu [r0 + 813 * 16], m3
21956
21957 ; mode 14 [row 23]
21958 pmaddubsw m3, m7, [r5 + 8 * 16]
21959 pmulhrsw m3, [pw_1024]
21960 pmaddubsw m5, m2, [r5 + 8 * 16]
21961 pmulhrsw m5, [pw_1024]
21962 packuswb m3, m5
21963 movu [r0 + 814 * 16], m3
21964 pmaddubsw m3, m1, [r5 + 8 * 16]
21965 pmulhrsw m3, [pw_1024]
21966 pmaddubsw m5, m4, [r5 + 8 * 16]
21967 pmulhrsw m5, [pw_1024]
21968 packuswb m3, m5
21969 movu [r0 + 815 * 16], m3
21970
21971 ; mode 15 [row 16]
21972 pmaddubsw m5, m2, [r5 + 31 * 16]
21973 pmulhrsw m5, [pw_1024]
21974 packuswb m5, m5
21975 movh [r0 + 864 * 16 + 8], m5
21976 pmaddubsw m3, m1, [r5 + 31 * 16]
21977 pmulhrsw m3, [pw_1024]
21978 pmaddubsw m5, m4, [r5 + 31 * 16]
21979 pmulhrsw m5, [pw_1024]
21980 packuswb m3, m5
21981 movu [r0 + 865 * 16], m3
21982
21983 ; mode 15 [row 17]
21984 pmaddubsw m5, m2, [r5 + 14 * 16]
21985 pmulhrsw m5, [pw_1024]
21986 packuswb m5, m5
21987 movh [r0 + 866 * 16 + 8], m5
21988 pmaddubsw m3, m1, [r5 + 14 * 16]
21989 pmulhrsw m3, [pw_1024]
21990 pmaddubsw m5, m4, [r5 + 14 * 16]
21991 pmulhrsw m5, [pw_1024]
21992 packuswb m3, m5
21993 movu [r0 + 867 * 16], m3
21994
21995 ; mode 14 [row 24]
21996 pslldq m7, 2
21997 pinsrb m7, [r3 + 22], 1
21998 pinsrb m7, [r3 + 25], 0
21999 pmaddubsw m3, m7, [r5 + 27 * 16]
22000 pmulhrsw m3, [pw_1024]
22001 pslldq m2, 2
22002 pinsrb m2, [r3 + 2], 1
22003 pinsrb m2, [r3 + 5], 0
22004 pmaddubsw m5, m2, [r5 + 27 * 16]
22005 pmulhrsw m5, [pw_1024]
22006 packuswb m3, m5
22007 movu [r0 + 816 * 16], m3
22008 pslldq m1, 2
22009 pinsrw m1, [r4 + 6], 0
22010 pmaddubsw m3, m1, [r5 + 27 * 16]
22011 pmulhrsw m3, [pw_1024]
22012 pslldq m4, 2
22013 pinsrw m4, [r4 + 14], 0
22014 pmaddubsw m5, m4, [r5 + 27 * 16]
22015 pmulhrsw m5, [pw_1024]
22016 packuswb m3, m5
22017 movu [r0 + 817 * 16], m3
22018
22019 ; mode 14 [row 25]
22020 pmaddubsw m3, m7, [r5 + 14 * 16]
22021 pmulhrsw m3, [pw_1024]
22022 pmaddubsw m5, m2, [r5 + 14 * 16]
22023 pmulhrsw m5, [pw_1024]
22024 packuswb m3, m5
22025 movu [r0 + 818 * 16], m3
22026 pmaddubsw m3, m1, [r5 + 14 * 16]
22027 pmulhrsw m3, [pw_1024]
22028 pmaddubsw m5, m4, [r5 + 14 * 16]
22029 pmulhrsw m5, [pw_1024]
22030 packuswb m3, m5
22031 movu [r0 + 819 * 16], m3
22032
22033 ; mode 14 [row 26]
22034 pmaddubsw m3, m7, [r5 + 1 * 16]
22035 pmulhrsw m3, [pw_1024]
22036 pmaddubsw m5, m2, [r5 + 1 * 16]
22037 pmulhrsw m5, [pw_1024]
22038 packuswb m3, m5
22039 movu [r0 + 820 * 16], m3
22040 pmaddubsw m3, m1, [r5 + 1 * 16]
22041 pmulhrsw m3, [pw_1024]
22042 pmaddubsw m5, m4, [r5 + 1 * 16]
22043 pmulhrsw m5, [pw_1024]
22044 packuswb m3, m5
22045 movu [r0 + 821 * 16], m3
22046
22047 ; mode 15 [row 18] - columns 8 to 31 (output slots 868 and 869); m2 is shared with mode 14 and patched here
22048 pinsrb m2, [r3 + 4], 0 ; mode 15 wants [r3 + 4] in byte 0; mode 14 [row 24] had put [r3 + 5] there
22049 pmaddubsw m5, m2, [r5 + 29 * 16] ; weight the byte pairs in m2 with coefficient row 29
22050 pmulhrsw m5, [pw_1024] ; rounding scale-down; equals (x + 16) >> 5 assuming pw_1024 holds words of 1024
22051 packuswb m5, m5 ; saturate to bytes, duplicated in both qwords
22052 movh [r0 + 868 * 16 + 8], m5 ; 8 bytes into the upper half of slot 868
22053 pmaddubsw m3, m1, [r5 + 29 * 16] ; m1 and m4 need no patch: they are used exactly as mode 14 left them
22054 pmulhrsw m3, [pw_1024]
22055 pmaddubsw m5, m4, [r5 + 29 * 16]
22056 pmulhrsw m5, [pw_1024]
22057 packuswb m3, m5 ; m3 = 16 result bytes (m1 part low, m4 part high)
22058 movu [r0 + 869 * 16], m3
22059
22060 ; mode 15 [row 19]
22061 pmaddubsw m5, m2, [r5 + 12 * 16]
22062 pmulhrsw m5, [pw_1024]
22063 packuswb m5, m5
22064 movh [r0 + 870 * 16 + 8], m5
22065 pmaddubsw m3, m1, [r5 + 12 * 16]
22066 pmulhrsw m3, [pw_1024]
22067 pmaddubsw m5, m4, [r5 + 12 * 16]
22068 pmulhrsw m5, [pw_1024]
22069 packuswb m3, m5
22070 movu [r0 + 871 * 16], m3
22071
22072 ; mode 15 [row 20 - 8 to 15]
22073 pslldq m3, m2, 2
22074 pinsrb m3, [r3 + 4], 1
22075 pinsrb m3, [r3 + 6], 0
22076 pmaddubsw m5, m3, [r5 + 27 * 16]
22077 pmulhrsw m5, [pw_1024]
22078 packuswb m5, m5
22079 movh [r0 + 872 * 16 + 8], m5
22080
22081 ; mode 15 [row 21 - 8 to 15]
22082 pmaddubsw m5, m3, [r5 + 10 * 16]
22083 pmulhrsw m5, [pw_1024]
22084 packuswb m5, m5
22085 movh [r0 + 874 * 16 + 8], m5
22086
22087 ; mode 15 [row 22 - 8 to 15]
22088 pslldq m3, 2
22089 pinsrb m3, [r3 + 6], 1
22090 pinsrb m3, [r3 + 8], 0
22091 pmaddubsw m5, m3, [r5 + 25 * 16]
22092 pmulhrsw m5, [pw_1024]
22093 packuswb m5, m5
22094 movh [r0 + 876 * 16 + 8], m5
22095
22096 ; mode 15 [row 23 - 8 to 15]
22097 pmaddubsw m5, m3, [r5 + 8 * 16]
22098 pmulhrsw m5, [pw_1024]
22099 packuswb m5, m5
22100 movh [r0 + 878 * 16 + 8], m5
22101
22102 ; mode 15 [row 24 - 8 to 15]
22103 pslldq m3, 2
22104 pinsrb m3, [r3 + 8], 1
22105 pinsrb m3, [r3 + 9], 0
22106 pmaddubsw m5, m3, [r5 + 23 * 16]
22107 pmulhrsw m5, [pw_1024]
22108 packuswb m5, m5
22109 movh [r0 + 880 * 16 + 8], m5
22110
22111 ; mode 15 [row 25 - 8 to 15]
22112 pmaddubsw m5, m3, [r5 + 6 * 16]
22113 pmulhrsw m5, [pw_1024]
22114 packuswb m5, m5
22115 movh [r0 + 882 * 16 + 8], m5
22116
22117 ; mode 15 [row 26 - 8 to 15]
22118 pslldq m3, 2
22119 pinsrb m3, [r3 + 9], 1
22120 pinsrb m3, [r3 + 11], 0
22121 pmaddubsw m5, m3, [r5 + 21 * 16]
22122 pmulhrsw m5, [pw_1024]
22123 packuswb m5, m5
22124 movh [r0 + 884 * 16 + 8], m5
22125
22126 ; mode 15 [row 27 - 8 to 15]
22127 pmaddubsw m5, m3, [r5 + 4 * 16]
22128 pmulhrsw m5, [pw_1024]
22129 packuswb m5, m5
22130 movh [r0 + 886 * 16 + 8], m5
22131
22132 ; mode 15 [row 28 - 8 to 15]
22133 pslldq m3, 2
22134 pinsrb m3, [r3 + 11], 1
22135 pinsrb m3, [r3 + 13], 0
22136 pmaddubsw m5, m3, [r5 + 19 * 16]
22137 pmulhrsw m5, [pw_1024]
22138 packuswb m5, m5
22139 movh [r0 + 888 * 16 + 8], m5
22140
22141 ; mode 15 [row 29 - 8 to 15]
22142 pmaddubsw m5, m3, [r5 + 2 * 16]
22143 pmulhrsw m5, [pw_1024]
22144 packuswb m5, m5
22145 movh [r0 + 890 * 16 + 8], m5
22146
22147 ; mode 15 [row 30 - 8 to 15]
22148 pslldq m3, 2
22149 pinsrb m3, [r3 + 13], 1
22150 pinsrb m3, [r3 + 15], 0
22151 pmaddubsw m5, m3, [r5 + 17 * 16]
22152 pmulhrsw m5, [pw_1024]
22153 packuswb m5, m5
22154 movh [r0 + 892 * 16 + 8], m5
22155
22156 ; mode 15 [row 31 - 8 to 15]
22157 pshufb m5, m3, [tab_S2]
22158 movh [r0 + 894 * 16 + 8], m5
22159
22160 ; mode 14 [row 27] - full row (output slots 822 and 823); advances m7, m2, m1 and m4 by one byte pair
22161 pinsrb m2, [r3 + 5], 0 ; undo the mode 15 [row 18] patch: byte 0 goes back from [r3 + 4] to [r3 + 5]
22162 pslldq m7, 2 ; open two bytes at the bottom of m7 for the next pair
22163 pinsrb m7, [r3 + 25], 1
22164 pinsrb m7, [r3 + 27], 0
22165 pmaddubsw m3, m7, [r5 + 20 * 16] ; m3 is scratch again; its last use as a mode 15 sample register was the row 31 shuffle above
22166 pmulhrsw m3, [pw_1024] ; rounding scale-down; equals (x + 16) >> 5 assuming pw_1024 holds words of 1024
22167 pslldq m2, 2 ; the restored [r3 + 5] moves up to byte 2
22168 pinsrb m2, [r3 + 5], 1
22169 pinsrb m2, [r3 + 7], 0
22170 pmaddubsw m5, m2, [r5 + 20 * 16]
22171 pmulhrsw m5, [pw_1024]
22172 packuswb m3, m5 ; m3 = 16 result bytes (m7 part low, m2 part high)
22173 movu [r0 + 822 * 16], m3
22174 pslldq m1, 2
22175 pinsrw m1, [r4 + 5], 0 ; fills bytes 0 and 1 with [r4 + 5] and [r4 + 6] in one insert
22176 pmaddubsw m3, m1, [r5 + 20 * 16]
22177 pmulhrsw m3, [pw_1024]
22178 pslldq m4, 2
22179 pinsrw m4, [r4 + 13], 0 ; m1/m4 in this state are read again by mode 15 [rows 20 and 21 - 16 to 31] below
22180 pmaddubsw m5, m4, [r5 + 20 * 16]
22181 pmulhrsw m5, [pw_1024]
22182 packuswb m3, m5 ; m3 = 16 result bytes (m1 part low, m4 part high)
22183 movu [r0 + 823 * 16], m3
22184
22185 ; mode 15 [row 20 - 16 to 31]
22186 pmaddubsw m3, m1, [r5 + 27 * 16]
22187 pmulhrsw m3, [pw_1024]
22188 pmaddubsw m5, m4, [r5 + 27 * 16]
22189 pmulhrsw m5, [pw_1024]
22190 packuswb m3, m5
22191 movu [r0 + 873 * 16], m3
22192
22193 ; mode 15 [row 21 - 16 to 31]
22194 pmaddubsw m3, m1, [r5 + 10 * 16]
22195 pmulhrsw m3, [pw_1024]
22196 pmaddubsw m5, m4, [r5 + 10 * 16]
22197 pmulhrsw m5, [pw_1024]
22198 packuswb m3, m5
22199 movu [r0 + 875 * 16], m3
22200
22201 ; mode 14 [row 28]
22202 pmaddubsw m3, m7, [r5 + 7 * 16]
22203 pmulhrsw m3, [pw_1024]
22204 pmaddubsw m5, m2, [r5 + 7 * 16]
22205 pmulhrsw m5, [pw_1024]
22206 packuswb m3, m5
22207 movu [r0 + 824 * 16], m3
22208 pmaddubsw m3, m1, [r5 + 7 * 16]
22209 pmulhrsw m3, [pw_1024]
22210 pmaddubsw m5, m4, [r5 + 7 * 16]
22211 pmulhrsw m5, [pw_1024]
22212 packuswb m3, m5
22213 movu [r0 + 825 * 16], m3
22214
22215 ; mode 14 [row 29]
22216 pslldq m7, 2
22217 pinsrb m7, [r3 + 27], 1
22218 pinsrb m7, [r3 + 30], 0
22219 pmaddubsw m3, m7, [r5 + 26 * 16]
22220 pmulhrsw m3, [pw_1024]
22221 pslldq m2, 2
22222 pinsrb m2, [r3 + 7], 1
22223 pinsrb m2, [r3 + 10], 0
22224 pmaddubsw m5, m2, [r5 + 26 * 16]
22225 pmulhrsw m5, [pw_1024]
22226 packuswb m3, m5
22227 movu [r0 + 826 * 16], m3
22228 pslldq m1, 2
22229 pinsrw m1, [r4 + 4], 0
22230 pmaddubsw m3, m1, [r5 + 26 * 16]
22231 pmulhrsw m3, [pw_1024]
22232 pslldq m4, 2
22233 pinsrw m4, [r4 + 12], 0
22234 pmaddubsw m5, m4, [r5 + 26 * 16]
22235 pmulhrsw m5, [pw_1024]
22236 packuswb m3, m5
22237 movu [r0 + 827 * 16], m3
22238
22239 ; mode 14 [row 30]
22240 pmaddubsw m3, m7, [r5 + 13 * 16]
22241 pmulhrsw m3, [pw_1024]
22242 pmaddubsw m5, m2, [r5 + 13 * 16]
22243 pmulhrsw m5, [pw_1024]
22244 packuswb m3, m5
22245 movu [r0 + 828 * 16], m3
22246 pmaddubsw m3, m1, [r5 + 13 * 16]
22247 pmulhrsw m3, [pw_1024]
22248 pmaddubsw m5, m4, [r5 + 13 * 16]
22249 pmulhrsw m5, [pw_1024]
22250 packuswb m3, m5
22251 movu [r0 + 829 * 16], m3
22252
22253 ; mode 15 [row 22 - 16 to 31]
22254 pmaddubsw m3, m1, [r5 + 25 * 16]
22255 pmulhrsw m3, [pw_1024]
22256 pmaddubsw m5, m4, [r5 + 25 * 16]
22257 pmulhrsw m5, [pw_1024]
22258 packuswb m3, m5
22259 movu [r0 + 877 * 16], m3
22260
22261 ; mode 15 [row 23 - 16 to 31]
22262 pmaddubsw m3, m1, [r5 + 8 * 16]
22263 pmulhrsw m3, [pw_1024]
22264 pmaddubsw m5, m4, [r5 + 8 * 16]
22265 pmulhrsw m5, [pw_1024]
22266 packuswb m3, m5
22267 movu [r0 + 879 * 16], m3
22268
22269 ; mode 14 [row 31]
22270 pshufb m3, m7, [tab_S2]
22271 movh [r0 + 830 * 16], m3
22272 pshufb m3, m2, [tab_S2]
22273 movh [r0 + 830 * 16 + 8], m3
22274 pshufb m3, m1, [tab_S2]
22275 movh [r0 + 831 * 16], m3
22276 pshufb m3, m4, [tab_S2]
22277 movh [r0 + 831 * 16 + 8], m3
22278
22279 ; mode 13 [row 31]
22280 pshufb m0, m6, [tab_S2]
22281 movh [r0 + 766 * 16], m0
22282 movh m0, [r4]
22283 movh [r0 + 766 * 16 + 8], m0
22284 movu m0, [r4 + 8]
22285 movu [r0 + 767 * 16], m0
22286
22287 ; mode 15 [row 24 - 16 to 31]
22288 pslldq m1, 2
22289 pinsrw m1, [r4 + 3], 0
22290 pmaddubsw m3, m1, [r5 + 23 * 16]
22291 pmulhrsw m3, [pw_1024]
22292 pslldq m4, 2
22293 pinsrw m4, [r4 + 11], 0
22294 pmaddubsw m5, m4, [r5 + 23 * 16]
22295 pmulhrsw m5, [pw_1024]
22296 packuswb m3, m5
22297 movu [r0 + 881 * 16], m3
22298
22299 ; mode 15 [row 25 - 16 to 31]
22300 pmaddubsw m3, m1, [r5 + 6 * 16]
22301 pmulhrsw m3, [pw_1024]
22302 pmaddubsw m5, m4, [r5 + 6 * 16]
22303 pmulhrsw m5, [pw_1024]
22304 packuswb m3, m5
22305 movu [r0 + 883 * 16], m3
22306
22307 ; mode 15 [row 26 - 16 to 31]
22308 pslldq m1, 2
22309 pinsrw m1, [r4 + 2], 0
22310 pmaddubsw m3, m1, [r5 + 21 * 16]
22311 pmulhrsw m3, [pw_1024]
22312 pslldq m4, 2
22313 pinsrw m4, [r4 + 10], 0
22314 pmaddubsw m5, m4, [r5 + 21 * 16]
22315 pmulhrsw m5, [pw_1024]
22316 packuswb m3, m5
22317 movu [r0 + 885 * 16], m3
22318
22319 ; mode 15 [row 27 - 16 to 31]
22320 pmaddubsw m3, m1, [r5 + 4 * 16]
22321 pmulhrsw m3, [pw_1024]
22322 pmaddubsw m5, m4, [r5 + 4 * 16]
22323 pmulhrsw m5, [pw_1024]
22324 packuswb m3, m5
22325 movu [r0 + 887 * 16], m3
22326
22327 ; mode 15 [row 28 - 16 to 31]
22328 pslldq m1, 2
22329 pinsrw m1, [r4 + 1], 0
22330 pmaddubsw m3, m1, [r5 + 19 * 16]
22331 pmulhrsw m3, [pw_1024]
22332 pslldq m4, 2
22333 pinsrw m4, [r4 + 9], 0
22334 pmaddubsw m5, m4, [r5 + 19 * 16]
22335 pmulhrsw m5, [pw_1024]
22336 packuswb m3, m5
22337 movu [r0 + 889 * 16], m3
22338
22339 ; mode 15 [row 29 - 16 to 31]
22340 pmaddubsw m3, m1, [r5 + 2 * 16]
22341 pmulhrsw m3, [pw_1024]
22342 pmaddubsw m5, m4, [r5 + 2 * 16]
22343 pmulhrsw m5, [pw_1024]
22344 packuswb m3, m5
22345 movu [r0 + 891 * 16], m3
22346
22347 ; mode 15 [row 30 - 16 to 31]
22348 pslldq m1, 2
22349 pinsrw m1, [r4 + 0], 0
22350 pmaddubsw m3, m1, [r5 + 17 * 16]
22351 pmulhrsw m3, [pw_1024]
22352 pslldq m4, 2
22353 pinsrw m4, [r4 + 8], 0
22354 pmaddubsw m5, m4, [r5 + 17 * 16]
22355 pmulhrsw m5, [pw_1024]
22356 packuswb m3, m5
22357 movu [r0 + 893 * 16], m3
22358
22359 ; mode 15 [row 31 - 16 to 31]
22360 pshufb m5, m1, [tab_S2]
22361 movh [r0 + 895 * 16], m5
22362 pshufb m5, m4, [tab_S2]
22363 movh [r0 + 895 * 16 + 8], m5
22364
22365 ; mode 16 [row 0] - first mode 16 row (output slots 896 and 897); builds the pair registers m0, m2, m1, m4 reused by later rows
22366 movu m6, [r5 + 11 * 16] ; m6 = coefficient row for this output row (the rows below reload it)
22367 movu m7, [pw_1024] ; m7 = rounding multiplier, kept in a register for the mode 16 rows that follow
22368 movh m0, [r4 ] ; 8 bytes starting at r4
22369 movh m1, [r4 + 1 ] ; the same run, one byte further on
22370 punpcklbw m0, m1 ; m0 = byte pairs ([r4 + i], [r4 + i + 1]) for i = 0..7
22371 pmaddubsw m1, m0, m6 ; per pair: multiply by the two coefficient bytes and add into one word
22372 pmulhrsw m1, m7 ; rounding scale-down; equals (x + 16) >> 5 assuming pw_1024 holds words of 1024
22373 movh m2, [r4 + 8]
22374 movh m3, [r4 + 9]
22375 punpcklbw m2, m3 ; m2 = byte pairs starting at r4 + 8
22376 pmaddubsw m3, m2, m6
22377 pmulhrsw m3, m7
22378 packuswb m1, m3 ; m1 = first 16 result bytes of the row (m0 part low, m2 part high)
22379 movu [r0 + 896 * 16], m1
22380
22381 movh m1, [r4 + 16] ; m1 is recycled: from here on it holds the pairs starting at r4 + 16
22382 movh m3, [r4 + 17]
22383 punpcklbw m1, m3
22384 pmaddubsw m3, m1, m6
22385 pmulhrsw m3, m7
22386 movh m4, [r4 + 24]
22387 movh m5, [r4 + 25]
22388 punpcklbw m4, m5 ; m4 = byte pairs starting at r4 + 24
22389 pmaddubsw m5, m4, m6
22390 pmulhrsw m5, m7
22391 packuswb m3, m5 ; m3 = second 16 result bytes of the row (m1 part low, m4 part high)
22392 movu [r0 + 897 * 16], m3
22393
22394 ; mode16 [row 1]
22395 movu m6, [r5 + 22 * 16]
22396 pslldq m0, 2
22397 pinsrb m0, [r4], 1
22398 pinsrb m0, [r3 + 2], 0
22399 pmaddubsw m3, m0, m6
22400 pmulhrsw m3, m7
22401 pslldq m2, 2
22402 pinsrw m2, [r4 + 7], 0
22403 pmaddubsw m5, m2, m6
22404 pmulhrsw m5, m7
22405 packuswb m3, m5
22406 movu [r0 + 898 * 16], m3
22407
22408 pslldq m1, 2
22409 pinsrw m1, [r4 + 15], 0
22410 pmaddubsw m3, m1, m6
22411 pmulhrsw m3, m7
22412 pslldq m4, 2
22413 pinsrw m4, [r4 + 23], 0
22414 pmaddubsw m5, m4, m6
22415 pmulhrsw m5, m7
22416 packuswb m3, m5
22417 movu [r0 + 899 * 16], m3
22418
22419 ; mode 16 [row 2] - same pair registers as row 1, only the coefficient row changes
22420 movu m6, [r5 + 1 * 16] ; m6 = coefficient row 1
22421 pmaddubsw m3, m0, m6 ; weighted sums for the first 8 outputs
22422 pmaddubsw m5, m2, m6 ; ... and for the next 8
22423 pmulhrsw m3, m7 ; round both halves (m7 = pw_1024)
22424 pmulhrsw m5, m7
22425 packuswb m3, m5 ; 16 result bytes
22426 movu [r0 + 900 * 16], m3
22427
22428 pmaddubsw m3, m1, m6 ; weighted sums for outputs 16..23
22429 pmaddubsw m5, m4, m6 ; ... and 24..31
22430 pmulhrsw m3, m7
22431 pmulhrsw m5, m7
22432 packuswb m3, m5 ; 16 result bytes
22433 movu [r0 + 901 * 16], m3
22434
22435 ; mode16 [row 3]
22436 movu m6, [r5 + 12 * 16]
22437 pslldq m0, 2
22438 pinsrb m0, [r3 + 2], 1
22439 pinsrb m0, [r3 + 3], 0
22440 pmaddubsw m3, m0, m6
22441 pmulhrsw m3, m7
22442 pslldq m2, 2
22443 pinsrw m2, [r4 + 6], 0
22444 pmaddubsw m5, m2, m6
22445 pmulhrsw m5, m7
22446 packuswb m3, m5
22447 movu [r0 + 902 * 16], m3
22448
22449 pslldq m1, 2
22450 pinsrw m1, [r4 + 14], 0
22451 pmaddubsw m3, m1, m6
22452 pmulhrsw m3, m7
22453 pslldq m4, 2
22454 pinsrw m4, [r4 + 22], 0
22455 pmaddubsw m5, m4, m6
22456 pmulhrsw m5, m7
22457 packuswb m3, m5
22458 movu [r0 + 903 * 16], m3
22459
22460 ; mode16 [row 4]
22461 movu m6, [r5 + 23 * 16]
22462 pslldq m0, 2
22463 pinsrb m0, [r3 + 3], 1
22464 pinsrb m0, [r3 + 5], 0
22465 pmaddubsw m3, m0, m6
22466 pmulhrsw m3, m7
22467 pslldq m2, 2
22468 pinsrw m2, [r4 + 5], 0
22469 pmaddubsw m5, m2, m6
22470 pmulhrsw m5, m7
22471 packuswb m3, m5
22472 movu [r0 + 904 * 16], m3
22473
22474 pslldq m1, 2
22475 pinsrw m1, [r4 + 13], 0
22476 pmaddubsw m3, m1, m6
22477 pmulhrsw m3, m7
22478 pslldq m4, 2
22479 pinsrw m4, [r4 + 21], 0
22480 pmaddubsw m5, m4, m6
22481 pmulhrsw m5, m7
22482 packuswb m3, m5
22483 movu [r0 + 905 * 16], m3
22484
22485 ; mode16 [row 5]
22486 movu m6, [r5 + 2 * 16]
22487 pmaddubsw m3, m0, m6
22488 pmulhrsw m3, m7
22489 pmaddubsw m5, m2, m6
22490 pmulhrsw m5, m7
22491 packuswb m3, m5
22492 movu [r0 + 906 * 16], m3
22493
22494 pmaddubsw m3, m1, m6
22495 pmulhrsw m3, m7
22496 pmaddubsw m5, m4, m6
22497 pmulhrsw m5, m7
22498 packuswb m3, m5
22499 movu [r0 + 907 * 16], m3
22500
22501 ; mode 16 [row 6] - full row (output slots 908 and 909); advances every pair register by one byte pair
22502 movu m6, [r5 + 13 * 16] ; m6 = coefficient row 13
22503 pslldq m0, 2 ; open two bytes at the bottom of m0
22504 pinsrb m0, [r3 + 5], 1 ; m0 is fed byte by byte from the r3 side
22505 pinsrb m0, [r3 + 6], 0
22506 pmaddubsw m3, m0, m6
22507 pmulhrsw m3, m7 ; m7 = pw_1024, loaded at mode 16 [row 0]
22508 pslldq m2, 2
22509 pinsrw m2, [r4 + 4], 0 ; one word insert puts [r4 + 4] in byte 0 and [r4 + 5] in byte 1, as the sibling rows do
22511 pmaddubsw m5, m2, m6
22512 pmulhrsw m5, m7
22513 packuswb m3, m5 ; m3 = 16 result bytes (m0 part low, m2 part high)
22514 movu [r0 + 908 * 16], m3
22515 pslldq m1, 2
22516 pinsrw m1, [r4 + 12], 0
22517 pmaddubsw m3, m1, m6
22518 pmulhrsw m3, m7
22519 pslldq m4, 2
22520 pinsrw m4, [r4 + 20], 0
22521 pmaddubsw m5, m4, m6
22522 pmulhrsw m5, m7
22523 packuswb m3, m5 ; m3 = 16 result bytes (m1 part low, m4 part high)
22524 movu [r0 + 909 * 16], m3
22525
22526 ; mode16 [row 7]
22527 movu m6, [r5 + 24 * 16]
22528 pslldq m0, 2
22529 pinsrb m0, [r3 + 6], 1
22530 pinsrb m0, [r3 + 8], 0
22531 pmaddubsw m3, m0, m6
22532 pmulhrsw m3, m7
22533 pslldq m2, 2
22534 pinsrw m2, [r4 + 3], 0
22535 pmaddubsw m5, m2, m6
22536 pmulhrsw m5, m7
22537 packuswb m3, m5
22538 movu [r0 + 910 * 16], m3
22539
22540 pslldq m1, 2
22541 pinsrw m1, [r4 + 11], 0
22542 pmaddubsw m3, m1, m6
22543 pmulhrsw m3, m7
22544 pslldq m4, 2
22545 pinsrw m4, [r4 + 19], 0
22546 pmaddubsw m5, m4, m6
22547 pmulhrsw m5, m7
22548 packuswb m3, m5
22549 movu [r0 + 911 * 16], m3
22550
22551 ; mode16 [row 8]
22552 movu m6, [r5 + 3 * 16]
22553 pmaddubsw m3, m0, m6
22554 pmulhrsw m3, m7
22555 pmaddubsw m5, m2, m6
22556 pmulhrsw m5, m7
22557 packuswb m3, m5
22558 movu [r0 + 912 * 16], m3
22559
22560 pmaddubsw m3, m1, m6
22561 pmulhrsw m3, m7
22562 pmaddubsw m5, m4, m6
22563 pmulhrsw m5, m7
22564 packuswb m3, m5
22565 movu [r0 + 913 * 16], m3
22566
22567 ; mode16 [row 9]
22568 movu m6, [r5 + 14 * 16]
22569 pslldq m0, 2
22570 pinsrb m0, [r3 + 8], 1
22571 pinsrb m0, [r3 + 9], 0
22572 pmaddubsw m3, m0, m6
22573 pmulhrsw m3, m7
22574 pslldq m2, 2
22575 pinsrw m2, [r4 + 2], 0
22576 pmaddubsw m5, m2, m6
22577 pmulhrsw m5, m7
22578 packuswb m3, m5
22579 movu [r0 + 914 * 16], m3
22580
22581 pslldq m1, 2
22582 pinsrw m1, [r4 + 10], 0
22583 pmaddubsw m3, m1, m6
22584 pmulhrsw m3, m7
22585 pslldq m4, 2
22586 pinsrw m4, [r4 + 18], 0
22587 pmaddubsw m5, m4, m6
22588 pmulhrsw m5, m7
22589 packuswb m3, m5
22590 movu [r0 + 915 * 16], m3
22591
22592 ; mode 16 [row 10] - full row (output slots 916 and 917); advances every pair register by one byte pair
22593 movu m6, [r5 + 25 * 16] ; m6 = coefficient row 25
22594 pslldq m0, 2 ; open two bytes at the bottom of m0
22595 pinsrb m0, [r3 + 9], 1 ; m0 is fed byte by byte from the r3 side
22596 pinsrb m0, [r3 + 11], 0
22597 pmaddubsw m3, m0, m6
22598 pmulhrsw m3, m7 ; m7 = pw_1024, loaded at mode 16 [row 0]
22599 pslldq m2, 2
22600 pinsrw m2, [r4 + 1], 0
22601 pmaddubsw m5, m2, m6
22602 pmulhrsw m5, m7
22603 packuswb m3, m5 ; m3 = 16 result bytes (m0 part low, m2 part high)
22604 movu [r0 + 916 * 16], m3
22605
22606 pslldq m1, 2
22607 pinsrw m1, [r4 + 9], 0
22608 pmaddubsw m3, m1, m6
22609 pmulhrsw m3, m7
22610 pslldq m4, 2
22611 pinsrw m4, [r4 + 17], 0 ; one word insert puts [r4 + 17] in byte 0 and [r4 + 18] in byte 1, as the sibling rows do
22613 pmaddubsw m5, m4, m6
22614 pmulhrsw m5, m7
22615 packuswb m3, m5 ; m3 = 16 result bytes (m1 part low, m4 part high)
22616 movu [r0 + 917 * 16], m3
22617
22618 ; mode16 [row 11]
22619 movu m6, [r5 + 4 * 16]
22620 pmaddubsw m3, m0, m6
22621 pmulhrsw m3, m7
22622 pmaddubsw m5, m2, m6
22623 pmulhrsw m5, m7
22624 packuswb m3, m5
22625 movu [r0 + 918 * 16], m3
22626
22627 pmaddubsw m3, m1, m6
22628 pmulhrsw m3, m7
22629 pmaddubsw m5, m4, m6
22630 pmulhrsw m5, m7
22631 packuswb m3, m5
22632 movu [r0 + 919 * 16], m3
22633
22634 ; mode16 [row 12]
22635 movu m6, [r5 + 15 * 16]
22636 pslldq m0, 2
22637 pinsrb m0, [r3 + 11], 1
22638 pinsrb m0, [r3 + 12], 0
22639 pmaddubsw m3, m0, m6
22640 pmulhrsw m3, m7
22641 pslldq m2, 2
22642 pinsrw m2, [r4 + 0], 0
22643 pmaddubsw m5, m2, m6
22644 pmulhrsw m5, m7
22645 packuswb m3, m5
22646 movu [r0 + 920 * 16], m3
22647
22648 pslldq m1, 2
22649 pinsrw m1, [r4 + 8], 0
22650 pmaddubsw m3, m1, m6
22651 pmulhrsw m3, m7
22652 pslldq m4, 2
22653 pinsrw m4, [r4 + 16], 0
22654 pmaddubsw m5, m4, m6
22655 pmulhrsw m5, m7
22656 packuswb m3, m5
22657 movu [r0 + 921 * 16], m3
22658
22659 ; mode16 [row 13]
22660 movu m6, [r5 + 26 * 16]
22661 pslldq m0, 2
22662 pinsrb m0, [r3 + 12], 1
22663 pinsrb m0, [r3 + 14], 0
22664 pmaddubsw m3, m0, m6
22665 pmulhrsw m3, m7
22666 pslldq m2, 2
22667 pinsrb m2, [r4 + 0], 1
22668 pinsrb m2, [r3 + 2], 0
22669 pmaddubsw m5, m2, m6
22670 pmulhrsw m5, m7
22671 packuswb m3, m5
22672 movu [r0 + 922 * 16], m3
22673
22674 pslldq m1, 2
22675 pinsrw m1, [r4 + 7], 0
22676 pmaddubsw m3, m1, m6
22677 pmulhrsw m3, m7
22678 pslldq m4, 2
22679 pinsrw m4, [r4 + 15], 0
22680 pmaddubsw m5, m4, m6
22681 pmulhrsw m5, m7
22682 packuswb m3, m5
22683 movu [r0 + 923 * 16], m3
22684
22685 ; mode16 [row 14]
22686 movu m6, [r5 + 5 * 16]
22687 pmaddubsw m3, m0, m6
22688 pmulhrsw m3, m7
22689 pmaddubsw m5, m2, m6
22690 pmulhrsw m5, m7
22691 packuswb m3, m5
22692 movu [r0 + 924 * 16], m3
22693
22694 pmaddubsw m3, m1, m6
22695 pmulhrsw m3, m7
22696 pmaddubsw m5, m4, m6
22697 pmulhrsw m5, m7
22698 packuswb m3, m5
22699 movu [r0 + 925 * 16], m3
22700
22701 ; mode16 [row 15]
22702 movu m6, [r5 + 16 * 16]
22703 pslldq m0, 2
22704 pinsrb m0, [r3 + 14], 1
22705 pinsrb m0, [r3 + 15], 0
22706 pmaddubsw m3, m0, m6
22707 pmulhrsw m3, m7
22708 pslldq m2, 2
22709 pinsrb m2, [r3 + 2], 1
22710 pinsrb m2, [r3 + 3], 0
22711 pmaddubsw m5, m2, m6
22712 pmulhrsw m5, m7
22713 packuswb m3, m5
22714 movu [r0 + 926 * 16], m3
22715
22716 pslldq m1, 2
22717 pinsrw m1, [r4 + 6], 0
22718 pmaddubsw m3, m1, m6
22719 pmulhrsw m3, m7
22720 pslldq m4, 2
22721 pinsrw m4, [r4 + 14], 0
22722 pmaddubsw m5, m4, m6
22723 pmulhrsw m5, m7
22724 packuswb m3, m5
22725 movu [r0 + 927 * 16], m3
22726
22727 ; mode16 [row 16]
22728 movu m6, [r5 + 27 * 16]
22729 pslldq m0, 2
22730 pinsrb m0, [r3 + 15], 1
22731 pinsrb m0, [r3 + 17], 0
22732 pmaddubsw m3, m0, m6
22733 pmulhrsw m3, m7
22734 pslldq m2, 2
22735 pinsrb m2, [r3 + 3], 1
22736 pinsrb m2, [r3 + 5], 0
22737 pmaddubsw m5, m2, m6
22738 pmulhrsw m5, m7
22739 packuswb m3, m5
22740 movu [r0 + 928 * 16], m3
22741
22742 pslldq m1, 2
22743 pinsrw m1, [r4 + 5], 0
22744 pmaddubsw m3, m1, m6
22745 pmulhrsw m3, m7
22746 pslldq m4, 2
22747 pinsrw m4, [r4 + 13], 0
22748 pmaddubsw m5, m4, m6
22749 pmulhrsw m5, m7
22750 packuswb m3, m5
22751 movu [r0 + 929 * 16], m3
22752
22753 ; mode16 [row 17]
22754 movu m6, [r5 + 6 * 16]
22755 pmaddubsw m3, m0, m6
22756 pmulhrsw m3, m7
22757 pmaddubsw m5, m2, m6
22758 pmulhrsw m5, m7
22759 packuswb m3, m5
22760 movu [r0 + 930 * 16], m3
22761
22762 pmaddubsw m3, m1, m6
22763 pmulhrsw m3, m7
22764 pmaddubsw m5, m4, m6
22765 pmulhrsw m5, m7
22766 packuswb m3, m5
22767 movu [r0 + 931 * 16], m3
22768
22769 ; mode16 [row 18]
22770 movu m6, [r5 + 17 * 16]
22771 pslldq m0, 2
22772 pinsrb m0, [r3 + 17], 1
22773 pinsrb m0, [r3 + 18], 0
22774 pmaddubsw m3, m0, m6
22775 pmulhrsw m3, m7
22776 pslldq m2, 2
22777 pinsrb m2, [r3 + 5], 1
22778 pinsrb m2, [r3 + 6], 0
22779 pmaddubsw m5, m2, m6
22780 pmulhrsw m5, m7
22781 packuswb m3, m5
22782 movu [r0 + 932 * 16], m3
22783
22784 pslldq m1, 2
22785 pinsrw m1, [r4 + 4], 0
22786 pmaddubsw m3, m1, m6
22787 pmulhrsw m3, m7
22788 pslldq m4, 2
22789 pinsrw m4, [r4 + 12], 0
22790 pmaddubsw m5, m4, m6
22791 pmulhrsw m5, m7
22792 packuswb m3, m5
22793 movu [r0 + 933 * 16], m3
22794
22795 ; mode16 [row 19]
22796 movu m6, [r5 + 28 * 16]
22797 pslldq m0, 2
22798 pinsrb m0, [r3 + 18], 1
22799 pinsrb m0, [r3 + 20], 0
22800 pmaddubsw m3, m0, m6
22801 pmulhrsw m3, m7
22802 pslldq m2, 2
22803 pinsrb m2, [r3 + 6], 1
22804 pinsrb m2, [r3 + 8], 0
22805 pmaddubsw m5, m2, m6
22806 pmulhrsw m5, m7
22807 packuswb m3, m5
22808 movu [r0 + 934 * 16], m3
22809
22810 pslldq m1, 2
22811 pinsrw m1, [r4 + 3], 0
22812 pmaddubsw m3, m1, m6
22813 pmulhrsw m3, m7
22814 pslldq m4, 2
22815 pinsrw m4, [r4 + 11], 0
22816 pmaddubsw m5, m4, m6
22817 pmulhrsw m5, m7
22818 packuswb m3, m5
22819 movu [r0 + 935 * 16], m3
22820
22821 ; mode16 [row 20]
22822 movu m6, [r5 + 7 * 16]
22823 pmaddubsw m3, m0, m6
22824 pmulhrsw m3, m7
22825 pmaddubsw m5, m2, m6
22826 pmulhrsw m5, m7
22827 packuswb m3, m5
22828 movu [r0 + 936 * 16], m3
22829
22830 pmaddubsw m3, m1, m6
22831 pmulhrsw m3, m7
22832 pmaddubsw m5, m4, m6
22833 pmulhrsw m5, m7
22834 packuswb m3, m5
22835 movu [r0 + 937 * 16], m3
22836
22837 ; mode16 [row 21]
22838 movu m6, [r5 + 18 * 16]
22839 pslldq m0, 2
22840 pinsrb m0, [r3 + 20], 1
22841 pinsrb m0, [r3 + 21], 0
22842 pmaddubsw m3, m0, m6
22843 pmulhrsw m3, m7
22844 pslldq m2, 2
22845 pinsrb m2, [r3 + 8], 1
22846 pinsrb m2, [r3 + 9], 0
22847 pmaddubsw m5, m2, m6
22848 pmulhrsw m5, m7
22849 packuswb m3, m5
22850 movu [r0 + 938 * 16], m3
22851
22852 pslldq m1, 2
22853 pinsrw m1, [r4 + 2], 0
22854 pmaddubsw m3, m1, m6
22855 pmulhrsw m3, m7
22856 pslldq m4, 2
22857 pinsrw m4, [r4 + 10], 0
22858 pmaddubsw m5, m4, m6
22859 pmulhrsw m5, m7
22860 packuswb m3, m5
22861 movu [r0 + 939 * 16], m3
22862
22863 ; mode16 [row 22]
22864 movu m6, [r5 + 29 * 16]
22865 pslldq m0, 2
22866 pinsrb m0, [r3 + 21], 1
22867 pinsrb m0, [r3 + 23], 0
22868 pmaddubsw m3, m0, m6
22869 pmulhrsw m3, m7
22870 pslldq m2, 2
22871 pinsrb m2, [r3 + 9], 1
22872 pinsrb m2, [r3 + 11], 0
22873 pmaddubsw m5, m2, m6
22874 pmulhrsw m5, m7
22875 packuswb m3, m5
22876 movu [r0 + 940 * 16], m3
22877
22878 pslldq m1, 2
22879 pinsrw m1, [r4 + 1], 0
22880 pmaddubsw m3, m1, m6
22881 pmulhrsw m3, m7
22882 pslldq m4, 2
22883 pinsrw m4, [r4 + 9], 0
22884 pmaddubsw m5, m4, m6
22885 pmulhrsw m5, m7
22886 packuswb m3, m5
22887 movu [r0 + 941 * 16], m3
22888
22889 ; mode16 [row 23]
22890 movu m6, [r5 + 8 * 16]
22891 pmaddubsw m3, m0, m6
22892 pmulhrsw m3, m7
22893 pmaddubsw m5, m2, m6
22894 pmulhrsw m5, m7
22895 packuswb m3, m5
22896 movu [r0 + 942 * 16], m3
22897
22898 pmaddubsw m3, m1, m6
22899 pmulhrsw m3, m7
22900 pmaddubsw m5, m4, m6
22901 pmulhrsw m5, m7
22902 packuswb m3, m5
22903 movu [r0 + 943 * 16], m3
22904
22905 ; mode16 [row 24]
22906 movu m6, [r5 + 19 * 16]
22907 pslldq m0, 2
22908 pinsrb m0, [r3 + 23], 1
22909 pinsrb m0, [r3 + 24], 0
22910 pmaddubsw m3, m0, m6
22911 pmulhrsw m3, m7
22912 pslldq m2, 2
22913 pinsrb m2, [r3 + 11], 1
22914 pinsrb m2, [r3 + 12], 0
22915 pmaddubsw m5, m2, m6
22916 pmulhrsw m5, m7
22917 packuswb m3, m5
22918 movu [r0 + 944 * 16], m3
22919
22920 pslldq m1, 2
22921 pinsrw m1, [r4 + 0], 0
22922 pmaddubsw m3, m1, m6
22923 pmulhrsw m3, m7
22924 pslldq m4, 2
22925 pinsrw m4, [r4 + 8], 0
22926 pmaddubsw m5, m4, m6
22927 pmulhrsw m5, m7
22928 packuswb m3, m5
22929 movu [r0 + 945 * 16], m3
22930
22931 ; mode16 [row 25]
22932 movu m6, [r5 + 30 * 16]
22933 pslldq m0, 2
22934 pinsrb m0, [r3 + 24], 1
22935 pinsrb m0, [r3 + 26], 0
22936 pmaddubsw m3, m0, m6
22937 pmulhrsw m3, m7
22938 pslldq m2, 2
22939 pinsrb m2, [r3 + 12], 1
22940 pinsrb m2, [r3 + 14], 0
22941 pmaddubsw m5, m2, m6
22942 pmulhrsw m5, m7
22943 packuswb m3, m5
22944 movu [r0 + 946 * 16], m3
22945
22946 pslldq m1, 2
22947 pinsrb m1, [r4 + 0], 1
22948 pinsrb m1, [r3 + 2], 0
22949 pmaddubsw m3, m1, m6
22950 pmulhrsw m3, m7
22951 pslldq m4, 2
22952 pinsrw m4, [r4 + 7], 0
22953 pmaddubsw m5, m4, m6
22954 pmulhrsw m5, m7
22955 packuswb m3, m5
22956 movu [r0 + 947 * 16], m3
22957
22958 ; mode16 [row 26]
22959 movu m6, [r5 + 9 * 16]
22960 pmaddubsw m3, m0, m6
22961 pmulhrsw m3, m7
22962 pmaddubsw m5, m2, m6
22963 pmulhrsw m5, m7
22964 packuswb m3, m5
22965 movu [r0 + 948 * 16], m3
22966
22967 pmaddubsw m3, m1, m6
22968 pmulhrsw m3, m7
22969 pmaddubsw m5, m4, m6
22970 pmulhrsw m5, m7
22971 packuswb m3, m5
22972 movu [r0 + 949 * 16], m3
22973
22974 ; mode16 [row 27]
22975 movu m6, [r5 + 20 * 16]
22976 pslldq m0, 2
22977 pinsrb m0, [r3 + 26], 1
22978 pinsrb m0, [r3 + 27], 0
22979 pmaddubsw m3, m0, m6
22980 pmulhrsw m3, m7
22981 pslldq m2, 2
22982 pinsrb m2, [r3 + 14], 1
22983 pinsrb m2, [r3 + 15], 0
22984 pmaddubsw m5, m2, m6
22985 pmulhrsw m5, m7
22986 packuswb m3, m5
22987 movu [r0 + 950 * 16], m3
22988
22989 pslldq m1, 2
22990 pinsrb m1, [r3 + 2], 1
22991 pinsrb m1, [r3 + 3], 0
22992 pmaddubsw m3, m1, m6
22993 pmulhrsw m3, m7
22994 pslldq m4, 2
22995 pinsrw m4, [r4 + 6], 0
22996 pmaddubsw m5, m4, m6
22997 pmulhrsw m5, m7
22998 packuswb m3, m5
22999 movu [r0 + 951 * 16], m3
23000
23001 ; mode16 [row 28]
23002 movu m6, [r5 + 31 * 16]
23003 pslldq m0, 2
23004 pinsrb m0, [r3 + 27], 1
23005 pinsrb m0, [r3 + 29], 0
23006 pmaddubsw m3, m0, m6
23007 pmulhrsw m3, m7
23008 pslldq m2, 2
23009 pinsrb m2, [r3 + 15], 1
23010 pinsrb m2, [r3 + 17], 0
23011 pmaddubsw m5, m2, m6
23012 pmulhrsw m5, m7
23013 packuswb m3, m5
23014 movu [r0 + 952 * 16], m3
23015
23016 pslldq m1, 2
23017 pinsrb m1, [r3 + 3], 1
23018 pinsrb m1, [r3 + 5], 0
23019 pmaddubsw m3, m1, m6
23020 pmulhrsw m3, m7
23021 pslldq m4, 2
23022 pinsrw m4, [r4 + 5], 0
23023 pmaddubsw m5, m4, m6
23024 pmulhrsw m5, m7
23025 packuswb m3, m5
23026 movu [r0 + 953 * 16], m3
23027
23028 ; mode16 [row 29]
23029 movu m6, [r5 + 10 * 16]
23030 pmaddubsw m3, m0, m6
23031 pmulhrsw m3, m7
23032 pmaddubsw m5, m2, m6
23033 pmulhrsw m5, m7
23034 packuswb m3, m5
23035 movu [r0 + 954 * 16], m3
23036
23037 pmaddubsw m3, m1, m6
23038 pmulhrsw m3, m7
23039 pmaddubsw m5, m4, m6
23040 pmulhrsw m5, m7
23041 packuswb m3, m5
23042 movu [r0 + 955 * 16], m3
23043
23044 ; mode16 [row 30]
23045 movu m6, [r5 + 21 * 16]
23046 pslldq m0, 2
23047 pinsrb m0, [r3 + 29], 1
23048 pinsrb m0, [r3 + 30], 0
23049 pmaddubsw m3, m0, m6
23050 pmulhrsw m3, m7
23051 pslldq m2, 2
23052 pinsrb m2, [r3 + 17], 1
23053 pinsrb m2, [r3 + 18], 0
23054 pmaddubsw m5, m2, m6
23055 pmulhrsw m5, m7
23056 packuswb m3, m5
23057 movu [r0 + 956 * 16], m3
23058
23059 pslldq m1, 2
23060 pinsrb m1, [r3 + 5], 1
23061 pinsrb m1, [r3 + 6], 0
23062 pmaddubsw m3, m1, m6
23063 pmulhrsw m3, m7
23064 pslldq m4, 2
23065 pinsrw m4, [r4 + 4], 0
23066 pmaddubsw m5, m4, m6
23067 pmulhrsw m5, m7
23068 packuswb m3, m5
23069 movu [r0 + 957 * 16], m3
23070
23071 ; mode16 [row 31]
23072 pshufb m5, m0, [tab_S2]
23073 movh [r0 + 958 * 16], m5
23074 pshufb m5, m2, [tab_S2]
23075 movh [r0 + 958 * 16 + 8], m5
23076 pshufb m5, m1, [tab_S2]
23077 movh [r0 + 959 * 16], m5
23078 pshufb m5, m4, [tab_S2]
23079 movh [r0 + 959 * 16 + 8], m5
23080
23081 ; mode 17 [row 0]
23082 movu m6, [r5 + 6 * 16]
23083 movu m7, [pw_1024]
23084 movh m0, [r4 ]
23085 movh m1, [r4 + 1 ]
23086 punpcklbw m0, m1
23087 pmaddubsw m1, m0, m6
23088 pmulhrsw m1, m7
23089 movh m2, [r4 + 8]
23090 movh m3, [r4 + 9]
23091 punpcklbw m2, m3
23092 pmaddubsw m3, m2, m6
23093 pmulhrsw m3, m7
23094 packuswb m1, m3
23095 movu [r0 + 960 * 16], m1
23096
23097 movh m1, [r4 + 16]
23098 movh m3, [r4 + 17]
23099 punpcklbw m1, m3
23100 pmaddubsw m3, m1, m6
23101 pmulhrsw m3, m7
23102 movh m4, [r4 + 24]
23103 movh m5, [r4 + 25]
23104 punpcklbw m4, m5
23105 pmaddubsw m5, m4, m6
23106 pmulhrsw m5, m7
23107 packuswb m3, m5
23108 movu [r0 + 961 * 16], m3
23109
23110 ; mode17 [row 1]
23111 movu m6, [r5 + 12 * 16]
23112 pslldq m0, 2
23113 pinsrb m0, [r3 + 0], 1
23114 pinsrb m0, [r3 + 1], 0
23115 pmaddubsw m3, m0, m6
23116 pmulhrsw m3, m7
23117 pslldq m2, 2
23118 pinsrw m2, [r4 + 7], 0
23119 pmaddubsw m5, m2, m6
23120 pmulhrsw m5, m7
23121 packuswb m3, m5
23122 movu [r0 + 962 * 16], m3
23123
23124 pslldq m1, 2
23125 pinsrw m1, [r4 + 15], 0
23126 pmaddubsw m3, m1, m6
23127 pmulhrsw m3, m7
23128 pslldq m4, 2
23129 pinsrw m4, [r4 + 23], 0
23130 pmaddubsw m5, m4, m6
23131 pmulhrsw m5, m7
23132 packuswb m3, m5
23133 movu [r0 + 963 * 16], m3
23134
23135 ; mode17 [row 2]
23136 movu m6, [r5 + 18 * 16]
23137 pslldq m0, 2
23138 pinsrb m0, [r3 + 1], 1
23139 pinsrb m0, [r3 + 2], 0
23140 pmaddubsw m3, m0, m6
23141 pmulhrsw m3, m7
23142 pslldq m2, 2
23143 pinsrw m2, [r4 + 6], 0
23144 pmaddubsw m5, m2, m6
23145 pmulhrsw m5, m7
23146 packuswb m3, m5
23147 movu [r0 + 964 * 16], m3
23148
23149 pslldq m1, 2
23150 pinsrw m1, [r4 + 14], 0
23151 pmaddubsw m3, m1, m6
23152 pmulhrsw m3, m7
23153 pslldq m4, 2
23154 pinsrw m4, [r4 + 22], 0
23155 pmaddubsw m5, m4, m6
23156 pmulhrsw m5, m7
23157 packuswb m3, m5
23158 movu [r0 + 965 * 16], m3
23159
23160 ; mode17 [row 3]
23161 movu m6, [r5 + 24 * 16]
23162 pslldq m0, 2
23163 pinsrb m0, [r3 + 2], 1
23164 pinsrb m0, [r3 + 4], 0
23165 pmaddubsw m3, m0, m6
23166 pmulhrsw m3, m7
23167 pslldq m2, 2
23168 pinsrw m2, [r4 + 5], 0
23169 pmaddubsw m5, m2, m6
23170 pmulhrsw m5, m7
23171 packuswb m3, m5
23172 movu [r0 + 966 * 16], m3
23173
23174 pslldq m1, 2
23175 pinsrw m1, [r4 + 13], 0
23176 pmaddubsw m3, m1, m6
23177 pmulhrsw m3, m7
23178 pslldq m4, 2
23179 pinsrw m4, [r4 + 21], 0
23180 pmaddubsw m5, m4, m6
23181 pmulhrsw m5, m7
23182 packuswb m3, m5
23183 movu [r0 + 967 * 16], m3
23184
23185 ; mode17 [row 4]
23186 movu m6, [r5 + 30 * 16]
23187 pslldq m0, 2
23188 pinsrb m0, [r3 + 4], 1
23189 pinsrb m0, [r3 + 5], 0
23190 pmaddubsw m3, m0, m6
23191 pmulhrsw m3, m7
23192 pslldq m2, 2
23193 pinsrw m2, [r4 + 4], 0
23194 pmaddubsw m5, m2, m6
23195 pmulhrsw m5, m7
23196 packuswb m3, m5
23197 movu [r0 + 968 * 16], m3
23198
23199 pslldq m1, 2
23200 pinsrw m1, [r4 + 12], 0
23201 pmaddubsw m3, m1, m6
23202 pmulhrsw m3, m7
23203 pslldq m4, 2
23204 pinsrw m4, [r4 + 20], 0
23205 pmaddubsw m5, m4, m6
23206 pmulhrsw m5, m7
23207 packuswb m3, m5
23208 movu [r0 + 969 * 16], m3
23209
23210 ; mode17 [row 5]
23211 movu m6, [r5 + 4 * 16]
23212 pmaddubsw m3, m0, m6
23213 pmulhrsw m3, m7
23214 pmaddubsw m5, m2, m6
23215 pmulhrsw m5, m7
23216 packuswb m3, m5
23217 movu [r0 + 970 * 16], m3
23218
23219 pmaddubsw m3, m1, m6
23220 pmulhrsw m3, m7
23221 pmaddubsw m5, m4, m6
23222 pmulhrsw m5, m7
23223 packuswb m3, m5
23224 movu [r0 + 971 * 16], m3
23225
23226 ; mode17 [row 6]
23227 movu m6, [r5 + 10 * 16]
23228 pslldq m0, 2
23229 pinsrb m0, [r3 + 5], 1
23230 pinsrb m0, [r3 + 6], 0
23231 pmaddubsw m3, m0, m6
23232 pmulhrsw m3, m7
23233 pslldq m2, 2
23234 pinsrw m2, [r4 + 3], 0
23235 pmaddubsw m5, m2, m6
23236 pmulhrsw m5, m7
23237 packuswb m3, m5
23238 movu [r0 + 972 * 16], m3
23239
23240 pslldq m1, 2
23241 pinsrw m1, [r4 + 11], 0
23242 pmaddubsw m3, m1, m6
23243 pmulhrsw m3, m7
23244 pslldq m4, 2
23245 pinsrw m4, [r4 + 19], 0
23246 pmaddubsw m5, m4, m6
23247 pmulhrsw m5, m7
23248 packuswb m3, m5
23249 movu [r0 + 973 * 16], m3
23250
23251 ; mode17 [row 7]
23252 movu m6, [r5 + 16 * 16]
23253 pslldq m0, 2
23254 pinsrb m0, [r3 + 6], 1
23255 pinsrb m0, [r3 + 7], 0
23256 pmaddubsw m3, m0, m6
23257 pmulhrsw m3, m7
23258 pslldq m2, 2
23259 pinsrw m2, [r4 + 2], 0
23260 pmaddubsw m5, m2, m6
23261 pmulhrsw m5, m7
23262 packuswb m3, m5
23263 movu [r0 + 974 * 16], m3
23264
23265 pslldq m1, 2
23266 pinsrw m1, [r4 + 10], 0
23267 pmaddubsw m3, m1, m6
23268 pmulhrsw m3, m7
23269 pslldq m4, 2
23270 pinsrw m4, [r4 + 18], 0
23271 pmaddubsw m5, m4, m6
23272 pmulhrsw m5, m7
23273 packuswb m3, m5
23274 movu [r0 + 975 * 16], m3
23275
23276 ; mode17 [row 8]
23277 movu m6, [r5 + 22 * 16]
23278 pslldq m0, 2
23279 pinsrb m0, [r3 + 7], 1
23280 pinsrb m0, [r3 + 9], 0
23281 pmaddubsw m3, m0, m6
23282 pmulhrsw m3, m7
23283 pslldq m2, 2
23284 pinsrw m2, [r4 + 1], 0
23285 pmaddubsw m5, m2, m6
23286 pmulhrsw m5, m7
23287 packuswb m3, m5
23288 movu [r0 + 976 * 16], m3
23289
23290 pslldq m1, 2
23291 pinsrw m1, [r4 + 9], 0
23292 pmaddubsw m3, m1, m6
23293 pmulhrsw m3, m7
23294 pslldq m4, 2
23295 pinsrw m4, [r4 + 17], 0
23296 pmaddubsw m5, m4, m6
23297 pmulhrsw m5, m7
23298 packuswb m3, m5
23299 movu [r0 + 977 * 16], m3
23300
23301 ; mode17 [row 9]
23302 movu m6, [r5 + 28 * 16]
23303 pslldq m0, 2
23304 pinsrb m0, [r3 + 9], 1
23305 pinsrb m0, [r3 + 10], 0
23306 pmaddubsw m3, m0, m6
23307 pmulhrsw m3, m7
23308 pslldq m2, 2
23309 pinsrw m2, [r4 + 0], 0
23310 pmaddubsw m5, m2, m6
23311 pmulhrsw m5, m7
23312 packuswb m3, m5
23313 movu [r0 + 978 * 16], m3
23314
23315 pslldq m1, 2
23316 pinsrw m1, [r4 + 8], 0
23317 pmaddubsw m3, m1, m6
23318 pmulhrsw m3, m7
23319 pslldq m4, 2
23320 pinsrw m4, [r4 + 16], 0
23321 pmaddubsw m5, m4, m6
23322 pmulhrsw m5, m7
23323 packuswb m3, m5
23324 movu [r0 + 979 * 16], m3
23325
23326 ; mode17 [row 10]
23327 movu m6, [r5 + 2 * 16]
23328 pmaddubsw m3, m0, m6
23329 pmulhrsw m3, m7
23330 pmaddubsw m5, m2, m6
23331 pmulhrsw m5, m7
23332 packuswb m3, m5
23333 movu [r0 + 980 * 16], m3
23334
23335 pmaddubsw m3, m1, m6
23336 pmulhrsw m3, m7
23337 pmaddubsw m5, m4, m6
23338 pmulhrsw m5, m7
23339 packuswb m3, m5
23340 movu [r0 + 981 * 16], m3
23341
23342 ; mode17 [row 11]
23343 movu m6, [r5 + 8 * 16]
23344 pslldq m0, 2
23345 pinsrb m0, [r3 + 10], 1
23346 pinsrb m0, [r3 + 11], 0
23347 pmaddubsw m3, m0, m6
23348 pmulhrsw m3, m7
23349 pslldq m2, 2
23350 pinsrb m2, [r4 + 0], 1
23351 pinsrb m2, [r3 + 1], 0
23352 pmaddubsw m5, m2, m6
23353 pmulhrsw m5, m7
23354 packuswb m3, m5
23355 movu [r0 + 982 * 16], m3
23356
23357 pslldq m1, 2
23358 pinsrw m1, [r4 + 7], 0
23359 pmaddubsw m3, m1, m6
23360 pmulhrsw m3, m7
23361 pslldq m4, 2
23362 pinsrw m4, [r4 + 15], 0
23363 pmaddubsw m5, m4, m6
23364 pmulhrsw m5, m7
23365 packuswb m3, m5
23366 movu [r0 + 983 * 16], m3
23367
23368 ; mode17 [row 12]
23369 movu m6, [r5 + 14 * 16]
23370 pslldq m0, 2
23371 pinsrb m0, [r3 + 11], 1
23372 pinsrb m0, [r3 + 12], 0
23373 pmaddubsw m3, m0, m6
23374 pmulhrsw m3, m7
23375 pslldq m2, 2
23376 pinsrb m2, [r3 + 1], 1
23377 pinsrb m2, [r3 + 2], 0
23378 pmaddubsw m5, m2, m6
23379 pmulhrsw m5, m7
23380 packuswb m3, m5
23381 movu [r0 + 984 * 16], m3
23382
23383 pslldq m1, 2
23384 pinsrw m1, [r4 + 6], 0
23385 pmaddubsw m3, m1, m6
23386 pmulhrsw m3, m7
23387 pslldq m4, 2
23388 pinsrw m4, [r4 + 14], 0
23389 pmaddubsw m5, m4, m6
23390 pmulhrsw m5, m7
23391 packuswb m3, m5
23392 movu [r0 + 985 * 16], m3
23393
23394 ; mode17 [row 13]
23395 movu m6, [r5 + 20 * 16]
23396 pslldq m0, 2
23397 pinsrb m0, [r3 + 12], 1
23398 pinsrb m0, [r3 + 14], 0
23399 pmaddubsw m3, m0, m6
23400 pmulhrsw m3, m7
23401 pslldq m2, 2
23402 pinsrb m2, [r3 + 2], 1
23403 pinsrb m2, [r3 + 4], 0
23404 pmaddubsw m5, m2, m6
23405 pmulhrsw m5, m7
23406 packuswb m3, m5
23407 movu [r0 + 986 * 16], m3
23408
23409 pslldq m1, 2
23410 pinsrw m1, [r4 + 5], 0
23411 pmaddubsw m3, m1, m6
23412 pmulhrsw m3, m7
23413 pslldq m4, 2
23414 pinsrw m4, [r4 + 13], 0
23415 pmaddubsw m5, m4, m6
23416 pmulhrsw m5, m7
23417 packuswb m3, m5
23418 movu [r0 + 987 * 16], m3
23419
23420 ; mode17 [row 14]
23421 movu m6, [r5 + 26 * 16]
23422 pslldq m0, 2
23423 pinsrb m0, [r3 + 14], 1
23424 pinsrb m0, [r3 + 15], 0
23425 pmaddubsw m3, m0, m6
23426 pmulhrsw m3, m7
23427 pslldq m2, 2
23428 pinsrb m2, [r3 + 4], 1
23429 pinsrb m2, [r3 + 5], 0
23430 pmaddubsw m5, m2, m6
23431 pmulhrsw m5, m7
23432 packuswb m3, m5
23433 movu [r0 + 988 * 16], m3
23434
23435 pslldq m1, 2
23436 pinsrw m1, [r4 + 4], 0
23437 pmaddubsw m3, m1, m6
23438 pmulhrsw m3, m7
23439 pslldq m4, 2
23440 pinsrw m4, [r4 + 12], 0
23441 pmaddubsw m5, m4, m6
23442 pmulhrsw m5, m7
23443 packuswb m3, m5
23444 movu [r0 + 989 * 16], m3
23445
23446 ; mode17 [row 15]
23447 pshufb m5, m0, [tab_S2]
23448 movh [r0 + 990 * 16], m5
23449 pshufb m5, m2, [tab_S2]
23450 movh [r0 + 990 * 16 + 8], m5
23451 pshufb m5, m1, [tab_S2]
23452 movh [r0 + 991 * 16], m5
23453 pshufb m5, m4, [tab_S2]
23454 movh [r0 + 991 * 16 + 8], m5
23455
23456 ; mode17 [row 16]
23457 movu m6, [r5 + 6 * 16]
23458 pslldq m0, 2
23459 pinsrb m0, [r3 + 15], 1
23460 pinsrb m0, [r3 + 16], 0
23461 pmaddubsw m3, m0, m6
23462 pmulhrsw m3, m7
23463 pslldq m2, 2
23464 pinsrb m2, [r3 + 5], 1
23465 pinsrb m2, [r3 + 6], 0
23466 pmaddubsw m5, m2, m6
23467 pmulhrsw m5, m7
23468 packuswb m3, m5
23469 movu [r0 + 992 * 16], m3
23470
23471 pslldq m1, 2
23472 pinsrw m1, [r4 + 3], 0
23473 pmaddubsw m3, m1, m6
23474 pmulhrsw m3, m7
23475 pslldq m4, 2
23476 pinsrw m4, [r4 + 11], 0
23477 pmaddubsw m5, m4, m6
23478 pmulhrsw m5, m7
23479 packuswb m3, m5
23480 movu [r0 + 993 * 16], m3
23481
23482 ; mode17 [row 17]
23483 movu m6, [r5 + 12 * 16]
23484 pslldq m0, 2
23485 pinsrb m0, [r3 + 16], 1
23486 pinsrb m0, [r3 + 17], 0
23487 pmaddubsw m3, m0, m6
23488 pmulhrsw m3, m7
23489 pslldq m2, 2
23490 pinsrb m2, [r3 + 6], 1
23491 pinsrb m2, [r3 + 7], 0
23492 pmaddubsw m5, m2, m6
23493 pmulhrsw m5, m7
23494 packuswb m3, m5
23495 movu [r0 + 994 * 16], m3
23496
23497 pslldq m1, 2
23498 pinsrw m1, [r4 + 2], 0
23499 pmaddubsw m3, m1, m6
23500 pmulhrsw m3, m7
23501 pslldq m4, 2
23502 pinsrw m4, [r4 + 10], 0
23503 pmaddubsw m5, m4, m6
23504 pmulhrsw m5, m7
23505 packuswb m3, m5
23506 movu [r0 + 995 * 16], m3
23507
23508 ; mode17 [row 18]
23509 movu m6, [r5 + 18 * 16]
23510 pslldq m0, 2
23511 pinsrb m0, [r3 + 17], 1
23512 pinsrb m0, [r3 + 18], 0
23513 pmaddubsw m3, m0, m6
23514 pmulhrsw m3, m7
23515 pslldq m2, 2
23516 pinsrb m2, [r3 + 7], 1
23517 pinsrb m2, [r3 + 9], 0
23518 pmaddubsw m5, m2, m6
23519 pmulhrsw m5, m7
23520 packuswb m3, m5
23521 movu [r0 + 996 * 16], m3
23522
23523 pslldq m1, 2
23524 pinsrw m1, [r4 + 1], 0
23525 pmaddubsw m3, m1, m6
23526 pmulhrsw m3, m7
23527 pslldq m4, 2
23528 pinsrw m4, [r4 + 9], 0
23529 pmaddubsw m5, m4, m6
23530 pmulhrsw m5, m7
23531 packuswb m3, m5
23532 movu [r0 + 997 * 16], m3
23533
23534 ; mode17 [row 19]
23535 movu m6, [r5 + 24 * 16]
23536 pslldq m0, 2
23537 pinsrb m0, [r3 + 18], 1
23538 pinsrb m0, [r3 + 20], 0
23539 pmaddubsw m3, m0, m6
23540 pmulhrsw m3, m7
23541 pslldq m2, 2
23542 pinsrb m2, [r3 + 9], 1
23543 pinsrb m2, [r3 + 10], 0
23544 pmaddubsw m5, m2, m6
23545 pmulhrsw m5, m7
23546 packuswb m3, m5
23547 movu [r0 + 998 * 16], m3
23548
23549 pslldq m1, 2
23550 pinsrw m1, [r4 + 0], 0
23551 pmaddubsw m3, m1, m6
23552 pmulhrsw m3, m7
23553 pslldq m4, 2
23554 pinsrw m4, [r4 + 8], 0
23555 pmaddubsw m5, m4, m6
23556 pmulhrsw m5, m7
23557 packuswb m3, m5
23558 movu [r0 + 999 * 16], m3
23559
23560 ; mode17 [row 20]
23561 movu m6, [r5 + 30 * 16]
23562 pslldq m0, 2
23563 pinsrb m0, [r3 + 20], 1
23564 pinsrb m0, [r3 + 21], 0
23565 pmaddubsw m3, m0, m6
23566 pmulhrsw m3, m7
23567 pslldq m2, 2
23568 pinsrb m2, [r3 + 10], 1
23569 pinsrb m2, [r3 + 11], 0
23570 pmaddubsw m5, m2, m6
23571 pmulhrsw m5, m7
23572 packuswb m3, m5
23573 movu [r0 + 1000 * 16], m3
23574
23575 pslldq m1, 2
23576 pinsrb m1, [r4 + 0], 1
23577 pinsrb m1, [r3 + 1], 0
23578 pmaddubsw m3, m1, m6
23579 pmulhrsw m3, m7
23580 pslldq m4, 2
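23581 ; Superseded by the single pinsrw below, which loads bytes [r4 + 7] and [r4 + 8] into lanes 0 and 1:
23582 ;   pinsrb m4, [r4 + 8], 1  /  pinsrb m4, [r4 + 7], 0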
23583 pinsrw m4, [r4 + 7], 0
23584 pmaddubsw m5, m4, m6
23585 pmulhrsw m5, m7
23586 packuswb m3, m5
23587 movu [r0 + 1001 * 16], m3
23588
23589 ; mode17 [row 21]
23590 movu m6, [r5 + 4 * 16]
23591 pmaddubsw m3, m0, m6
23592 pmulhrsw m3, m7
23593 pmaddubsw m5, m2, m6
23594 pmulhrsw m5, m7
23595 packuswb m3, m5
23596 movu [r0 + 1002 * 16], m3
23597
23598 pmaddubsw m3, m1, m6
23599 pmulhrsw m3, m7
23600 pmaddubsw m5, m4, m6
23601 pmulhrsw m5, m7
23602 packuswb m3, m5
23603 movu [r0 + 1003 * 16], m3
23604
23605 ; mode17 [row 22]
23606 movu m6, [r5 + 10 * 16]
23607 pslldq m0, 2
23608 pinsrb m0, [r3 + 21], 1
23609 pinsrb m0, [r3 + 22], 0
23610 pmaddubsw m3, m0, m6
23611 pmulhrsw m3, m7
23612 pslldq m2, 2
23613 pinsrb m2, [r3 + 11], 1
23614 pinsrb m2, [r3 + 12], 0
23615 pmaddubsw m5, m2, m6
23616 pmulhrsw m5, m7
23617 packuswb m3, m5
23618 movu [r0 + 1004 * 16], m3
23619
23620 pslldq m1, 2
23621 pinsrb m1, [r3 + 1], 1
23622 pinsrb m1, [r3 + 2], 0
23623 pmaddubsw m3, m1, m6
23624 pmulhrsw m3, m7
23625 pslldq m4, 2
23626 pinsrw m4, [r4 + 6], 0
23627 pmaddubsw m5, m4, m6
23628 pmulhrsw m5, m7
23629 packuswb m3, m5
23630 movu [r0 + 1005 * 16], m3
23631
23632 ; mode17 [row 23]
23633 movu m6, [r5 + 16 * 16]
23634 pslldq m0, 2
23635 pinsrb m0, [r3 + 22], 1
23636 pinsrb m0, [r3 + 23], 0
23637 pmaddubsw m3, m0, m6
23638 pmulhrsw m3, m7
23639 pslldq m2, 2
23640 pinsrb m2, [r3 + 12], 1
23641 pinsrb m2, [r3 + 14], 0
23642 pmaddubsw m5, m2, m6
23643 pmulhrsw m5, m7
23644 packuswb m3, m5
23645 movu [r0 + 1006 * 16], m3
23646
23647 pslldq m1, 2
23648 pinsrb m1, [r3 + 2], 1
23649 pinsrb m1, [r3 + 4], 0
23650 pmaddubsw m3, m1, m6
23651 pmulhrsw m3, m7
23652 pslldq m4, 2
23653 pinsrw m4, [r4 + 5], 0
23654 pmaddubsw m5, m4, m6
23655 pmulhrsw m5, m7
23656 packuswb m3, m5
23657 movu [r0 + 1007 * 16], m3
23658
23659 ; mode17 [row 24]
23660 movu m6, [r5 + 22 * 16]
23661 pslldq m0, 2
23662 pinsrb m0, [r3 + 23], 1
23663 pinsrb m0, [r3 + 25], 0
23664 pmaddubsw m3, m0, m6
23665 pmulhrsw m3, m7
23666 pslldq m2, 2
23667 pinsrb m2, [r3 + 14], 1
23668 pinsrb m2, [r3 + 15], 0
23669 pmaddubsw m5, m2, m6
23670 pmulhrsw m5, m7
23671 packuswb m3, m5
23672 movu [r0 + 1008 * 16], m3
23673
23674 pslldq m1, 2
23675 pinsrb m1, [r3 + 4], 1
23676 pinsrb m1, [r3 + 5], 0
23677 pmaddubsw m3, m1, m6
23678 pmulhrsw m3, m7
23679 pslldq m4, 2
23680 pinsrw m4, [r4 + 4], 0
23681 pmaddubsw m5, m4, m6
23682 pmulhrsw m5, m7
23683 packuswb m3, m5
23684 movu [r0 + 1009 * 16], m3
23685
23686 ; mode17 [row 25]
23687 movu m6, [r5 + 28 * 16]
23688 pslldq m0, 2
23689 pinsrb m0, [r3 + 25], 1
23690 pinsrb m0, [r3 + 26], 0
23691 pmaddubsw m3, m0, m6
23692 pmulhrsw m3, m7
23693 pslldq m2, 2
23694 pinsrb m2, [r3 + 15], 1
23695 pinsrb m2, [r3 + 16], 0
23696 pmaddubsw m5, m2, m6
23697 pmulhrsw m5, m7
23698 packuswb m3, m5
23699 movu [r0 + 1010 * 16], m3
23700
23701 pslldq m1, 2
23702 pinsrb m1, [r3 + 5], 1
23703 pinsrb m1, [r3 + 6], 0
23704 pmaddubsw m3, m1, m6
23705 pmulhrsw m3, m7
23706 pslldq m4, 2
23707 pinsrw m4, [r4 + 3], 0
23708 pmaddubsw m5, m4, m6
23709 pmulhrsw m5, m7
23710 packuswb m3, m5
23711 movu [r0 + 1011 * 16], m3
23712
23713 ; mode17 [row 26]
23714 movu m6, [r5 + 2 * 16]
23715 pmaddubsw m3, m0, m6
23716 pmulhrsw m3, m7
23717 pmaddubsw m5, m2, m6
23718 pmulhrsw m5, m7
23719 packuswb m3, m5
23720 movu [r0 + 1012 * 16], m3
23721
23722 pmaddubsw m3, m1, m6
23723 pmulhrsw m3, m7
23724 pmaddubsw m5, m4, m6
23725 pmulhrsw m5, m7
23726 packuswb m3, m5
23727 movu [r0 + 1013 * 16], m3
23728
23729 ; mode17 [row 27]
23730 movu m6, [r5 + 8 * 16]
23731 pslldq m0, 2
23732 pinsrb m0, [r3 + 26], 1
23733 pinsrb m0, [r3 + 27], 0
23734 pmaddubsw m3, m0, m6
23735 pmulhrsw m3, m7
23736 pslldq m2, 2
23737 pinsrb m2, [r3 + 16], 1
23738 pinsrb m2, [r3 + 17], 0
23739 pmaddubsw m5, m2, m6
23740 pmulhrsw m5, m7
23741 packuswb m3, m5
23742 movu [r0 + 1014 * 16], m3
23743
23744 pslldq m1, 2
23745 pinsrb m1, [r3 + 6], 1
23746 pinsrb m1, [r3 + 7], 0
23747 pmaddubsw m3, m1, m6
23748 pmulhrsw m3, m7
23749 pslldq m4, 2
23750 pinsrw m4, [r4 + 2], 0
23751 pmaddubsw m5, m4, m6
23752 pmulhrsw m5, m7
23753 packuswb m3, m5
23754 movu [r0 + 1015 * 16], m3
23755
23756 ; mode17 [row 28]
23757 movu m6, [r5 + 14 * 16]
23758 pslldq m0, 2
23759 pinsrb m0, [r3 + 27], 1
23760 pinsrb m0, [r3 + 28], 0
23761 pmaddubsw m3, m0, m6
23762 pmulhrsw m3, m7
23763 pslldq m2, 2
23764 pinsrb m2, [r3 + 17], 1
23765 pinsrb m2, [r3 + 18], 0
23766 pmaddubsw m5, m2, m6
23767 pmulhrsw m5, m7
23768 packuswb m3, m5
23769 movu [r0 + 1016 * 16], m3
23770
23771 pslldq m1, 2
23772 pinsrb m1, [r3 + 7], 1
23773 pinsrb m1, [r3 + 9], 0
23774 pmaddubsw m3, m1, m6
23775 pmulhrsw m3, m7
23776 pslldq m4, 2
23777 pinsrw m4, [r4 + 1], 0
23778 pmaddubsw m5, m4, m6
23779 pmulhrsw m5, m7
23780 packuswb m3, m5
23781 movu [r0 + 1017 * 16], m3
23782
23783 ; mode17 [row 29]
23784 movu m6, [r5 + 20 * 16]
23785 pslldq m0, 2
23786 pinsrb m0, [r3 + 28], 1
23787 pinsrb m0, [r3 + 30], 0
23788 pmaddubsw m3, m0, m6
23789 pmulhrsw m3, m7
23790 pslldq m2, 2
23791 pinsrb m2, [r3 + 18], 1
23792 pinsrb m2, [r3 + 20], 0
23793 pmaddubsw m5, m2, m6
23794 pmulhrsw m5, m7
23795 packuswb m3, m5
23796 movu [r0 + 1018 * 16], m3
23797
23798 pslldq m1, 2
23799 pinsrb m1, [r3 + 9], 1
23800 pinsrb m1, [r3 + 10], 0
23801 pmaddubsw m3, m1, m6
23802 pmulhrsw m3, m7
23803 pslldq m4, 2
23804 pinsrw m4, [r4 + 0], 0
23805 pmaddubsw m5, m4, m6
23806 pmulhrsw m5, m7
23807 packuswb m3, m5
23808 movu [r0 + 1019 * 16], m3
23809
23810 ; mode17 [row 30]
23811 movu m6, [r5 + 26 * 16]
23812 pslldq m0, 2
23813 pinsrb m0, [r3 + 30], 1
23814 pinsrb m0, [r3 + 31], 0
23815 pmaddubsw m3, m0, m6
23816 pmulhrsw m3, m7
23817 pslldq m2, 2
23818 pinsrb m2, [r3 + 20], 1
23819 pinsrb m2, [r3 + 21], 0
23820 pmaddubsw m5, m2, m6
23821 pmulhrsw m5, m7
23822 packuswb m3, m5
23823 movu [r0 + 1020 * 16], m3
23824
23825 pslldq m1, 2
23826 pinsrb m1, [r3 + 10], 1
23827 pinsrb m1, [r3 + 11], 0
23828 pmaddubsw m3, m1, m6
23829 pmulhrsw m3, m7
23830 pslldq m4, 2
23831 pinsrb m4, [r4 + 0], 1
23832 pinsrb m4, [r3 + 1], 0
23833 pmaddubsw m5, m4, m6
23834 pmulhrsw m5, m7
23835 packuswb m3, m5
23836 movu [r0 + 1021 * 16], m3
23837
23838 ; mode17 [row 31]
23839 pshufb m5, m0, [tab_S2]
23840 movh [r0 + 1022 * 16], m5
23841 pshufb m5, m2, [tab_S2]
23842 movh [r0 + 1022 * 16 + 8], m5
23843 pshufb m5, m1, [tab_S2]
23844 movh [r0 + 1023 * 16], m5
23845 pshufb m5, m4, [tab_S2]
23846 movh [r0 + 1023 * 16 + 8], m5
23847
23848 ;mode 18[row 0]
23849 movu m0, [r3]
23850 movu [r0 + 1024 * 16], m0
23851 movu m1, [r3 + 16]
23852 movu [r0 + 1025 * 16], m1
23853
23854 ;mode 18[row 1]
23855 pslldq m0, 1
23856 pinsrb m0, [r4 + 1], 0
23857 movu [r0 + 1026 * 16], m0
23858 pslldq m1, 1
23859 pinsrb m1, [r3 + 15], 0
23860 movu [r0 + 1027 * 16], m1
23861
23862 ;mode 18[row 2]
23863 pslldq m0, 1
23864 pinsrb m0, [r4 + 2], 0
23865 movu [r0 + 1028 * 16], m0
23866 pslldq m1, 1
23867 pinsrb m1, [r3 + 14], 0
23868 movu [r0 + 1029 * 16], m1
23869
23870 ;mode 18[row 3]
23871 pslldq m0, 1
23872 pinsrb m0, [r4 + 3], 0
23873 movu [r0 + 1030 * 16], m0
23874 pslldq m1, 1
23875 pinsrb m1, [r3 + 13], 0
23876 movu [r0 + 1031 * 16], m1
23877
23878 ;mode 18[row 4]
23879 pslldq m0, 1
23880 pinsrb m0, [r4 + 4], 0
23881 movu [r0 + 1032 * 16], m0
23882 pslldq m1, 1
23883 pinsrb m1, [r3 + 12], 0
23884 movu [r0 + 1033 * 16], m1
23885
23886 ;mode 18[row 5]
23887 pslldq m0, 1
23888 pinsrb m0, [r4 + 5], 0
23889 movu [r0 + 1034 * 16], m0
23890 pslldq m1, 1
23891 pinsrb m1, [r3 + 11], 0
23892 movu [r0 + 1035 * 16], m1
23893
23894 ;mode 18[row 6]
23895 pslldq m0, 1
23896 pinsrb m0, [r4 + 6], 0
23897 movu [r0 + 1036 * 16], m0
23898 pslldq m1, 1
23899 pinsrb m1, [r3 + 10], 0
23900 movu [r0 + 1037 * 16], m1
23901
23902 ;mode 18[row 7]
23903 pslldq m0, 1
23904 pinsrb m0, [r4 + 7], 0
23905 movu [r0 + 1038 * 16], m0
23906 pslldq m1, 1
23907 pinsrb m1, [r3 + 9], 0
23908 movu [r0 + 1039 * 16], m1
23909
23910 ;mode 18[row 8]
23911 pslldq m0, 1
23912 pinsrb m0, [r4 + 8], 0
23913 movu [r0 + 1040 * 16], m0
23914 pslldq m1, 1
23915 pinsrb m1, [r3 + 8], 0
23916 movu [r0 + 1041 * 16], m1
23917
23918 ;mode 18[row 9]
23919 pslldq m0, 1
23920 pinsrb m0, [r4 + 9], 0
23921 movu [r0 + 1042 * 16], m0
23922 pslldq m1, 1
23923 pinsrb m1, [r3 + 7], 0
23924 movu [r0 + 1043 * 16], m1
23925
23926 ;mode 18[row 10]
23927 pslldq m0, 1
23928 pinsrb m0, [r4 + 10], 0
23929 movu [r0 + 1044 * 16], m0
23930 pslldq m1, 1
23931 pinsrb m1, [r3 + 6], 0
23932 movu [r0 + 1045 * 16], m1
23933
23934 ;mode 18[row 11]
23935 pslldq m0, 1
23936 pinsrb m0, [r4 + 11], 0
23937 movu [r0 + 1046 * 16], m0
23938 pslldq m1, 1
23939 pinsrb m1, [r3 + 5], 0
23940 movu [r0 + 1047 * 16], m1
23941
23942 ;mode 18[row 12]
23943 pslldq m0, 1
23944 pinsrb m0, [r4 + 12], 0
23945 movu [r0 + 1048 * 16], m0
23946 pslldq m1, 1
23947 pinsrb m1, [r3 + 4], 0
23948 movu [r0 + 1049 * 16], m1
23949
23950 ;mode 18[row 13]
23951 pslldq m0, 1
23952 pinsrb m0, [r4 + 13], 0
23953 movu [r0 + 1050 * 16], m0
23954 pslldq m1, 1
23955 pinsrb m1, [r3 + 3], 0
23956 movu [r0 + 1051 * 16], m1
23957
23958 ;mode 18[row 14]
23959 pslldq m0, 1
23960 pinsrb m0, [r4 + 14], 0
23961 movu [r0 + 1052 * 16], m0
23962 pslldq m1, 1
23963 pinsrb m1, [r3 + 2], 0
23964 movu [r0 + 1053 * 16], m1
23965
23966 ;mode 18[row 15]
23967 pslldq m0, 1
23968 pinsrb m0, [r4 + 15], 0
23969 movu [r0 + 1054 * 16], m0
23970 pslldq m1, 1
23971 pinsrb m1, [r3 + 1], 0
23972 movu [r0 + 1055 * 16], m1
23973
23974 ;mode 18[row 16]
23975 pslldq m0, 1
23976 pinsrb m0, [r4 + 16], 0
23977 movu [r0 + 1056 * 16], m0
23978 pslldq m1, 1
23979 pinsrb m1, [r3 + 0], 0
23980 movu [r0 + 1057 * 16], m1
23981
23982 ;mode 18[row 17]
23983 pslldq m0, 1
23984 pinsrb m0, [r4 + 17], 0
23985 movu [r0 + 1058 * 16], m0
23986 pslldq m1, 1
23987 pinsrb m1, [r4 + 1], 0
23988 movu [r0 + 1059 * 16], m1
23989
23990 ;mode 18[row 18]
23991 pslldq m0, 1
23992 pinsrb m0, [r4 + 18], 0
23993 movu [r0 + 1060 * 16], m0
23994 pslldq m1, 1
23995 pinsrb m1, [r4 + 2], 0
23996 movu [r0 + 1061 * 16], m1
23997
23998 ;mode 18[row 19]
23999 pslldq m0, 1
24000 pinsrb m0, [r4 + 19], 0
24001 movu [r0 + 1062 * 16], m0
24002 pslldq m1, 1
24003 pinsrb m1, [r4 + 3], 0
24004 movu [r0 + 1063 * 16], m1
24005
24006 ;mode 18[row 20]
24007 pslldq m0, 1
24008 pinsrb m0, [r4 + 20], 0
24009 movu [r0 + 1064 * 16], m0
24010 pslldq m1, 1
24011 pinsrb m1, [r4 + 4], 0
24012 movu [r0 + 1065 * 16], m1
24013
24014 ;mode 18[row 21]
24015 pslldq m0, 1
24016 pinsrb m0, [r4 + 21], 0
24017 movu [r0 + 1066 * 16], m0
24018 pslldq m1, 1
24019 pinsrb m1, [r4 + 5], 0
24020 movu [r0 + 1067 * 16], m1
24021
24022 ;mode 18[row 22]
24023 pslldq m0, 1
24024 pinsrb m0, [r4 + 22], 0
24025 movu [r0 + 1068 * 16], m0
24026 pslldq m1, 1
24027 pinsrb m1, [r4 + 6], 0
24028 movu [r0 + 1069 * 16], m1
24029
24030 ;mode 18[row 23]
24031 pslldq m0, 1
24032 pinsrb m0, [r4 + 23], 0
24033 movu [r0 + 1070 * 16], m0
24034 pslldq m1, 1
24035 pinsrb m1, [r4 + 7], 0
24036 movu [r0 + 1071 * 16], m1
24037
24038 ;mode 18[row 24]
24039 pslldq m0, 1
24040 pinsrb m0, [r4 + 24], 0
24041 movu [r0 + 1072 * 16], m0
24042 pslldq m1, 1
24043 pinsrb m1, [r4 + 8], 0
24044 movu [r0 + 1073 * 16], m1
24045
24046 ;mode 18[row 25]
24047 pslldq m0, 1
24048 pinsrb m0, [r4 + 25], 0
24049 movu [r0 + 1074 * 16], m0
24050 pslldq m1, 1
24051 pinsrb m1, [r4 + 9], 0
24052 movu [r0 + 1075 * 16], m1
24053
24054 ;mode 18[row 26]
24055 pslldq m0, 1
24056 pinsrb m0, [r4 + 26], 0
24057 movu [r0 + 1076 * 16], m0
24058 pslldq m1, 1
24059 pinsrb m1, [r4 + 10], 0
24060 movu [r0 + 1077 * 16], m1
24061
24062 ;mode 18[row 27]
24063 pslldq m0, 1
24064 pinsrb m0, [r4 + 27], 0
24065 movu [r0 + 1078 * 16], m0
24066 pslldq m1, 1
24067 pinsrb m1, [r4 + 11], 0
24068 movu [r0 + 1079 * 16], m1
24069
24070 ;mode 18[row 28]
24071 pslldq m0, 1
24072 pinsrb m0, [r4 + 28], 0
24073 movu [r0 + 1080 * 16], m0
24074 pslldq m1, 1
24075 pinsrb m1, [r4 + 12], 0
24076 movu [r0 + 1081 * 16], m1
24077
24078 ;mode 18[row 29]
24079 pslldq m0, 1
24080 pinsrb m0, [r4 + 29], 0
24081 movu [r0 + 1082 * 16], m0
24082 pslldq m1, 1
24083 pinsrb m1, [r4 + 13], 0
24084 movu [r0 + 1083 * 16], m1
24085
24086 ;mode 18[row 30]
24087 pslldq m0, 1
24088 pinsrb m0, [r4 + 30], 0
24089 movu [r0 + 1084 * 16], m0
24090 pslldq m1, 1
24091 pinsrb m1, [r4 + 14], 0
24092 movu [r0 + 1085 * 16], m1
24093
24094 ;mode 18[row 31]
24095 pslldq m0, 1
24096 pinsrb m0, [r4 + 31], 0
24097 movu [r0 + 1086 * 16], m0
24098 pslldq m1, 1
24099 pinsrb m1, [r4 + 15], 0
24100 movu [r0 + 1087 * 16], m1
24101
24102 ; mode 19 [row 0]
; Mode 19 section: produces 32 rows of 32 bytes, stored as two 16-byte writes
; per row into slots 1088..1151 of the buffer at r0 (one slot = 16 bytes).
; Register roles as used below:
;   r3     - reference bytes; the four windows start at offsets 0, 8, 16, 24
;   r4     - second reference array; supplies the new low pair of a window once
;            that window has slid past r3[0]. The r4 offsets are not contiguous
;            (e.g. 2 then 4) - presumably an inverse-angle projection; confirm
;            against the C reference.
;   r5     - table of 16-byte weight rows, addressed as r5 + k * 16
;   m6     - weight row for the current output row
;   m7     - pmulhrsw multiplier, set up before this view (presumably the
;            rounding/scale constant - TODO confirm)
;   m0..m3 - sliding windows of interleaved byte pairs feeding output bytes
;            0-7 (m0), 8-15 (m2), 16-23 (m1) and 24-31 (m3) of each row
;   m4, m5 - scratch
; The weight-row index advances by 6 per row modulo 32 (6, 12, ..., 30, 4, ...).
; On rows where it wraps (5, 10, 21, 26) the windows are reused unshifted; on
; the other rows each window is shifted up one pair (pslldq 2) and a new pair
; is inserted into bytes 0-1. Rows 15 and 31, where the index would be 0, skip
; the multiply and store bytes selected by tab_S2 instead.
24103 movu m6, [r5 + 6 * 16]
24104 movu m0, [r3 ]
24105 movu m1, [r3 + 1 ]
24106 punpcklbw m0, m1 ; m0 = pairs (r3[i], r3[i+1]) for i = 0..7
24107 pmaddubsw m1, m0, m6
24108 pmulhrsw m1, m7
24109 movu m2, [r3 + 8]
24110 movu m3, [r3 + 9]
24111 punpcklbw m2, m3 ; m2 = pairs for i = 8..15
24112 pmaddubsw m3, m2, m6
24113 pmulhrsw m3, m7
24114 packuswb m1, m3
24115 movu [r0 + 1088 * 16], m1
24116
24117 movu m1, [r3 + 16]
24118 movu m3, [r3 + 17]
24119 punpcklbw m1, m3 ; m1 = pairs for i = 16..23
24120 pmaddubsw m4, m1, m6
24121 pmulhrsw m4, m7
24122 movu m3, [r3 + 24]
24123 movu m5, [r3 + 25]
24124 punpcklbw m3, m5 ; m3 = pairs for i = 24..31
24125 pmaddubsw m5, m3, m6
24126 pmulhrsw m5, m7
24127 packuswb m4, m5
24128 movu [r0 + 1089 * 16], m4
24129
24130 ; mode 19 [row 1]
24131 movu m6, [r5 + 12 * 16]
24132 pslldq m0, 2 ; slide window up by one byte pair
24133 pinsrb m0, [r4 + 0], 1
24134 pinsrb m0, [r4 + 1], 0 ; m0's new low pair comes from r4
24135 pmaddubsw m4, m0, m6
24136 pmulhrsw m4, m7
24137 pslldq m2, 2
24138 pinsrw m2, [r3 + 7], 0 ; word load fills bytes 0-1 with r3[7], r3[8]
24139 pmaddubsw m5, m2, m6
24140 pmulhrsw m5, m7
24141 packuswb m4, m5
24142 movu [r0 + 1090 * 16], m4
24143 pslldq m1, 2
24144 pinsrw m1, [r3 + 15], 0
24145 pmaddubsw m4, m1, m6
24146 pmulhrsw m4, m7
24147 pslldq m3, 2
24148 pinsrw m3, [r3 + 23], 0
24149 pmaddubsw m5, m3, m6
24150 pmulhrsw m5, m7
24151 packuswb m4, m5
24152 movu [r0 + 1091 * 16], m4
24153
24154 ; mode 19 [row 2]
24155 movu m6, [r5 + 18 * 16]
24156 pslldq m0, 2
24157 pinsrb m0, [r4 + 1], 1
24158 pinsrb m0, [r4 + 2], 0
24159 pmaddubsw m4, m0, m6
24160 pmulhrsw m4, m7
24161 pslldq m2, 2
24162 pinsrw m2, [r3 + 6], 0
24163 pmaddubsw m5, m2, m6
24164 pmulhrsw m5, m7
24165 packuswb m4, m5
24166 movu [r0 + 1092 * 16], m4
24167 pslldq m1, 2
24168 pinsrw m1, [r3 + 14], 0
24169 pmaddubsw m4, m1, m6
24170 pmulhrsw m4, m7
24171 pslldq m3, 2
24172 pinsrw m3, [r3 + 22], 0
24173 pmaddubsw m5, m3, m6
24174 pmulhrsw m5, m7
24175 packuswb m4, m5
24176 movu [r0 + 1093 * 16], m4
24177
24178 ; mode 19 [row 3]
24179 movu m6, [r5 + 24 * 16]
24180 pslldq m0, 2
24181 pinsrb m0, [r4 + 2], 1
24182 pinsrb m0, [r4 + 4], 0
24183 pmaddubsw m4, m0, m6
24184 pmulhrsw m4, m7
24185 pslldq m2, 2
24186 pinsrw m2, [r3 + 5], 0
24187 pmaddubsw m5, m2, m6
24188 pmulhrsw m5, m7
24189 packuswb m4, m5
24190 movu [r0 + 1094 * 16], m4
24191 pslldq m1, 2
24192 pinsrw m1, [r3 + 13], 0
24193 pmaddubsw m4, m1, m6
24194 pmulhrsw m4, m7
24195 pslldq m3, 2
24196 pinsrw m3, [r3 + 21], 0
24197 pmaddubsw m5, m3, m6
24198 pmulhrsw m5, m7
24199 packuswb m4, m5
24200 movu [r0 + 1095 * 16], m4
24201
24202 ; mode 19 [row 4]
24203 movu m6, [r5 + 30 * 16]
24204 pslldq m0, 2
24205 pinsrb m0, [r4 + 4], 1
24206 pinsrb m0, [r4 + 5], 0
24207 pmaddubsw m4, m0, m6
24208 pmulhrsw m4, m7
24209 pslldq m2, 2
24210 pinsrw m2, [r3 + 4], 0
24211 pmaddubsw m5, m2, m6
24212 pmulhrsw m5, m7
24213 packuswb m4, m5
24214 movu [r0 + 1096 * 16], m4
24215 pslldq m1, 2
24216 pinsrw m1, [r3 + 12], 0
24217 pmaddubsw m4, m1, m6
24218 pmulhrsw m4, m7
24219 pslldq m3, 2
24220 pinsrw m3, [r3 + 20], 0
24221 pmaddubsw m5, m3, m6
24222 pmulhrsw m5, m7
24223 packuswb m4, m5
24224 movu [r0 + 1097 * 16], m4
24225
24226 ; mode 19 [row 5] - weight index wrapped (30 -> 4): windows reused unshifted
24227 movu m6, [r5 + 4 * 16]
24228 pmaddubsw m4, m0, m6
24229 pmulhrsw m4, m7
24230 pmaddubsw m5, m2, m6
24231 pmulhrsw m5, m7
24232 packuswb m4, m5
24233 movu [r0 + 1098 * 16], m4
24234 pmaddubsw m4, m1, m6
24235 pmulhrsw m4, m7
24236 pmaddubsw m5, m3, m6
24237 pmulhrsw m5, m7
24238 packuswb m4, m5
24239 movu [r0 + 1099 * 16], m4
24240
24241 ; mode 19 [row 6]
24242 movu m6, [r5 + 10 * 16]
24243 pslldq m0, 2
24244 pinsrb m0, [r4 + 5], 1
24245 pinsrb m0, [r4 + 6], 0
24246 pmaddubsw m4, m0, m6
24247 pmulhrsw m4, m7
24248 pslldq m2, 2
24249 pinsrw m2, [r3 + 3], 0
24250 pmaddubsw m5, m2, m6
24251 pmulhrsw m5, m7
24252 packuswb m4, m5
24253 movu [r0 + 1100 * 16], m4
24254 pslldq m1, 2
24255 pinsrw m1, [r3 + 11], 0
24256 pmaddubsw m4, m1, m6
24257 pmulhrsw m4, m7
24258 pslldq m3, 2
24259 pinsrw m3, [r3 + 19], 0
24260 pmaddubsw m5, m3, m6
24261 pmulhrsw m5, m7
24262 packuswb m4, m5
24263 movu [r0 + 1101 * 16], m4
24264
24265 ; mode 19 [row 7]
24266 movu m6, [r5 + 16 * 16]
24267 pslldq m0, 2
24268 pinsrb m0, [r4 + 6], 1
24269 pinsrb m0, [r4 + 7], 0
24270 pmaddubsw m4, m0, m6
24271 pmulhrsw m4, m7
24272 pslldq m2, 2
24273 pinsrw m2, [r3 + 2], 0
24274 pmaddubsw m5, m2, m6
24275 pmulhrsw m5, m7
24276 packuswb m4, m5
24277 movu [r0 + 1102 * 16], m4
24278 pslldq m1, 2
24279 pinsrw m1, [r3 + 10], 0
24280 pmaddubsw m4, m1, m6
24281 pmulhrsw m4, m7
24282 pslldq m3, 2
24283 pinsrw m3, [r3 + 18], 0
24284 pmaddubsw m5, m3, m6
24285 pmulhrsw m5, m7
24286 packuswb m4, m5
24287 movu [r0 + 1103 * 16], m4
24288
24289 ; mode 19 [row 8]
24290 movu m6, [r5 + 22 * 16]
24291 pslldq m0, 2
24292 pinsrb m0, [r4 + 7], 1
24293 pinsrb m0, [r4 + 9], 0
24294 pmaddubsw m4, m0, m6
24295 pmulhrsw m4, m7
24296 pslldq m2, 2
24297 pinsrw m2, [r3 + 1], 0
24298 pmaddubsw m5, m2, m6
24299 pmulhrsw m5, m7
24300 packuswb m4, m5
24301 movu [r0 + 1104 * 16], m4
24302 pslldq m1, 2
24303 pinsrw m1, [r3 + 9], 0
24304 pmaddubsw m4, m1, m6
24305 pmulhrsw m4, m7
24306 pslldq m3, 2
24307 pinsrw m3, [r3 + 17], 0
24308 pmaddubsw m5, m3, m6
24309 pmulhrsw m5, m7
24310 packuswb m4, m5
24311 movu [r0 + 1105 * 16], m4
24312
24313 ; mode 19 [row 9]
24314 movu m6, [r5 + 28 * 16]
24315 pslldq m0, 2
24316 pinsrb m0, [r4 + 9], 1
24317 pinsrb m0, [r4 + 10], 0
24318 pmaddubsw m4, m0, m6
24319 pmulhrsw m4, m7
24320 pslldq m2, 2
24321 pinsrw m2, [r3 + 0], 0
24322 pmaddubsw m5, m2, m6
24323 pmulhrsw m5, m7
24324 packuswb m4, m5
24325 movu [r0 + 1106 * 16], m4
24326 pslldq m1, 2
24327 pinsrw m1, [r3 + 8], 0
24328 pmaddubsw m4, m1, m6
24329 pmulhrsw m4, m7
24330 pslldq m3, 2
24331 pinsrw m3, [r3 + 16], 0
24332 pmaddubsw m5, m3, m6
24333 pmulhrsw m5, m7
24334 packuswb m4, m5
24335 movu [r0 + 1107 * 16], m4
24336
24337 ; mode 19 [row 10] - weight index wrapped (28 -> 2): windows reused unshifted
24338 movu m6, [r5 + 2 * 16]
24339 pmaddubsw m4, m0, m6
24340 pmulhrsw m4, m7
24341 pmaddubsw m5, m2, m6
24342 pmulhrsw m5, m7
24343 packuswb m4, m5
24344 movu [r0 + 1108 * 16], m4
24345 pmaddubsw m4, m1, m6
24346 pmulhrsw m4, m7
24347 pmaddubsw m5, m3, m6
24348 pmulhrsw m5, m7
24349 packuswb m4, m5
24350 movu [r0 + 1109 * 16], m4
24351
24352 ; mode 19 [row 11]
24353 movu m6, [r5 + 8 * 16]
24354 pslldq m0, 2
24355 pinsrb m0, [r4 + 10], 1
24356 pinsrb m0, [r4 + 11], 0
24357 pmaddubsw m4, m0, m6
24358 pmulhrsw m4, m7
24359 pslldq m2, 2
24360 pinsrb m2, [r3 + 0], 1 ; m2 has reached r3[0]; from here its low byte comes from r4
24361 pinsrb m2, [r4 + 1], 0
24362 pmaddubsw m5, m2, m6
24363 pmulhrsw m5, m7
24364 packuswb m4, m5
24365 movu [r0 + 1110 * 16], m4
24366 pslldq m1, 2
24367 pinsrw m1, [r3 + 7], 0
24368 pmaddubsw m4, m1, m6
24369 pmulhrsw m4, m7
24370 pslldq m3, 2
24371 pinsrw m3, [r3 + 15], 0
24372 pmaddubsw m5, m3, m6
24373 pmulhrsw m5, m7
24374 packuswb m4, m5
24375 movu [r0 + 1111 * 16], m4
24376
24377 ; mode 19 [row 12]
24378 movu m6, [r5 + 14 * 16]
24379 pslldq m0, 2
24380 pinsrb m0, [r4 + 11], 1
24381 pinsrb m0, [r4 + 12], 0
24382 pmaddubsw m4, m0, m6
24383 pmulhrsw m4, m7
24384 pslldq m2, 2
24385 pinsrb m2, [r4 + 1], 1
24386 pinsrb m2, [r4 + 2], 0
24387 pmaddubsw m5, m2, m6
24388 pmulhrsw m5, m7
24389 packuswb m4, m5
24390 movu [r0 + 1112 * 16], m4
24391 pslldq m1, 2
24392 pinsrw m1, [r3 + 6], 0
24393 pmaddubsw m4, m1, m6
24394 pmulhrsw m4, m7
24395 pslldq m3, 2
24396 pinsrw m3, [r3 + 14], 0
24397 pmaddubsw m5, m3, m6
24398 pmulhrsw m5, m7
24399 packuswb m4, m5
24400 movu [r0 + 1113 * 16], m4
24401
24402 ; mode 19 [row 13]
24403 movu m6, [r5 + 20 * 16]
24404 pslldq m0, 2
24405 pinsrb m0, [r4 + 12], 1
24406 pinsrb m0, [r4 + 14], 0
24407 pmaddubsw m4, m0, m6
24408 pmulhrsw m4, m7
24409 pslldq m2, 2
24410 pinsrb m2, [r4 + 2], 1
24411 pinsrb m2, [r4 + 4], 0
24412 pmaddubsw m5, m2, m6
24413 pmulhrsw m5, m7
24414 packuswb m4, m5
24415 movu [r0 + 1114 * 16], m4
24416 pslldq m1, 2
24417 pinsrw m1, [r3 + 5], 0
24418 pmaddubsw m4, m1, m6
24419 pmulhrsw m4, m7
24420 pslldq m3, 2
24421 pinsrw m3, [r3 + 13], 0
24422 pmaddubsw m5, m3, m6
24423 pmulhrsw m5, m7
24424 packuswb m4, m5
24425 movu [r0 + 1115 * 16], m4
24426
24427 ; mode 19 [row 14]
24428 movu m6, [r5 + 26 * 16]
24429 pslldq m0, 2
24430 pinsrb m0, [r4 + 14], 1
24431 pinsrb m0, [r4 + 15], 0
24432 pmaddubsw m4, m0, m6
24433 pmulhrsw m4, m7
24434 pslldq m2, 2
24435 pinsrb m2, [r4 + 4], 1
24436 pinsrb m2, [r4 + 5], 0
24437 pmaddubsw m5, m2, m6
24438 pmulhrsw m5, m7
24439 packuswb m4, m5
24440 movu [r0 + 1116 * 16], m4
24441 pslldq m1, 2
24442 pinsrw m1, [r3 + 4], 0
24443 pmaddubsw m4, m1, m6
24444 pmulhrsw m4, m7
24445 pslldq m3, 2
24446 pinsrw m3, [r3 + 12], 0
24447 pmaddubsw m5, m3, m6
24448 pmulhrsw m5, m7
24449 packuswb m4, m5
24450 movu [r0 + 1117 * 16], m4
24451
24452 ; mode 19 [row 15] - no multiply: 8 bytes per window are picked via tab_S2 (table defined outside this view; presumably selects one byte of each pair - confirm)
24453 pshufb m5, m0, [tab_S2]
24454 movh [r0 + 1118 * 16], m5
24455 pshufb m5, m2, [tab_S2]
24456 movh [r0 + 1118 * 16 + 8], m5
24457 pshufb m5, m1, [tab_S2]
24458 movh [r0 + 1119 * 16], m5
24459 pshufb m5, m3, [tab_S2]
24460 movh [r0 + 1119 * 16 + 8], m5
24461
24462 ; mode 19 [row 16]
24463 movu m6, [r5 + 6 * 16]
24464 pslldq m0, 2
24465 pinsrb m0, [r4 + 15], 1
24466 pinsrb m0, [r4 + 16], 0
24467 pmaddubsw m4, m0, m6
24468 pmulhrsw m4, m7
24469 pslldq m2, 2
24470 pinsrb m2, [r4 + 5], 1
24471 pinsrb m2, [r4 + 6], 0
24472 pmaddubsw m5, m2, m6
24473 pmulhrsw m5, m7
24474 packuswb m4, m5
24475 movu [r0 + 1120 * 16], m4
24476 pslldq m1, 2
24477 pinsrw m1, [r3 + 3], 0
24478 pmaddubsw m4, m1, m6
24479 pmulhrsw m4, m7
24480 pslldq m3, 2
24481 pinsrw m3, [r3 + 11], 0
24482 pmaddubsw m5, m3, m6
24483 pmulhrsw m5, m7
24484 packuswb m4, m5
24485 movu [r0 + 1121 * 16], m4
24486
24487 ; mode 19 [row 17]
24488 movu m6, [r5 + 12 * 16]
24489 pslldq m0, 2
24490 pinsrb m0, [r4 + 16], 1
24491 pinsrb m0, [r4 + 17], 0
24492 pmaddubsw m4, m0, m6
24493 pmulhrsw m4, m7
24494 pslldq m2, 2
24495 pinsrb m2, [r4 + 6], 1
24496 pinsrb m2, [r4 + 7], 0
24497 pmaddubsw m5, m2, m6
24498 pmulhrsw m5, m7
24499 packuswb m4, m5
24500 movu [r0 + 1122 * 16], m4
24501 pslldq m1, 2
24502 pinsrw m1, [r3 + 2], 0
24503 pmaddubsw m4, m1, m6
24504 pmulhrsw m4, m7
24505 pslldq m3, 2
24506 pinsrw m3, [r3 + 10], 0
24507 pmaddubsw m5, m3, m6
24508 pmulhrsw m5, m7
24509 packuswb m4, m5
24510 movu [r0 + 1123 * 16], m4
24511
24512 ; mode 19 [row 18]
24513 movu m6, [r5 + 18 * 16]
24514 pslldq m0, 2
24515 pinsrb m0, [r4 + 17], 1
24516 pinsrb m0, [r4 + 18], 0
24517 pmaddubsw m4, m0, m6
24518 pmulhrsw m4, m7
24519 pslldq m2, 2
24520 pinsrb m2, [r4 + 7], 1
24521 pinsrb m2, [r4 + 9], 0
24522 pmaddubsw m5, m2, m6
24523 pmulhrsw m5, m7
24524 packuswb m4, m5
24525 movu [r0 + 1124 * 16], m4
24526 pslldq m1, 2
24527 pinsrw m1, [r3 + 1], 0
24528 pmaddubsw m4, m1, m6
24529 pmulhrsw m4, m7
24530 pslldq m3, 2
24531 pinsrw m3, [r3 + 9], 0
24532 pmaddubsw m5, m3, m6
24533 pmulhrsw m5, m7
24534 packuswb m4, m5
24535 movu [r0 + 1125 * 16], m4
24536
24537 ; mode 19 [row 19]
24538 movu m6, [r5 + 24 * 16]
24539 pslldq m0, 2
24540 pinsrb m0, [r4 + 18], 1
24541 pinsrb m0, [r4 + 20], 0
24542 pmaddubsw m4, m0, m6
24543 pmulhrsw m4, m7
24544 pslldq m2, 2
24545 pinsrb m2, [r4 + 9], 1
24546 pinsrb m2, [r4 + 10], 0
24547 pmaddubsw m5, m2, m6
24548 pmulhrsw m5, m7
24549 packuswb m4, m5
24550 movu [r0 + 1126 * 16], m4
24551 pslldq m1, 2
24552 pinsrw m1, [r3 + 0], 0
24553 pmaddubsw m4, m1, m6
24554 pmulhrsw m4, m7
24555 pslldq m3, 2
24556 pinsrw m3, [r3 + 8], 0
24557 pmaddubsw m5, m3, m6
24558 pmulhrsw m5, m7
24559 packuswb m4, m5
24560 movu [r0 + 1127 * 16], m4
24561
24562 ; mode 19 [row 20]
24563 movu m6, [r5 + 30 * 16]
24564 pslldq m0, 2
24565 pinsrb m0, [r4 + 20], 1
24566 pinsrb m0, [r4 + 21], 0
24567 pmaddubsw m4, m0, m6
24568 pmulhrsw m4, m7
24569 pslldq m2, 2
24570 pinsrb m2, [r4 + 10], 1
24571 pinsrb m2, [r4 + 11], 0
24572 pmaddubsw m5, m2, m6
24573 pmulhrsw m5, m7
24574 packuswb m4, m5
24575 movu [r0 + 1128 * 16], m4
24576 pslldq m1, 2
24577 pinsrb m1, [r4 + 0], 1 ; m1 now starts taking its new pairs from r4
24578 pinsrb m1, [r4 + 1], 0
24579 pmaddubsw m4, m1, m6
24580 pmulhrsw m4, m7
24581 pslldq m3, 2
24582 pinsrb m3, [r3 + 8], 1 ; two byte inserts: bytes 0-1 = r3[7], r3[8] (same bytes a word insert from r3 + 7 would give)
24583 pinsrb m3, [r3 + 7], 0
24584 pmaddubsw m5, m3, m6
24585 pmulhrsw m5, m7
24586 packuswb m4, m5
24587 movu [r0 + 1129 * 16], m4
24588
24589 ; mode 19 [row 21] - weight index wrapped (30 -> 4): windows reused unshifted
24590 movu m6, [r5 + 4 * 16]
24591 pmaddubsw m4, m0, m6
24592 pmulhrsw m4, m7
24593 pmaddubsw m5, m2, m6
24594 pmulhrsw m5, m7
24595 packuswb m4, m5
24596 movu [r0 + 1130 * 16], m4
24597 pmaddubsw m4, m1, m6
24598 pmulhrsw m4, m7
24599 pmaddubsw m5, m3, m6
24600 pmulhrsw m5, m7
24601 packuswb m4, m5
24602 movu [r0 + 1131 * 16], m4
24603
24604 ; mode 19 [row 22]
24605 movu m6, [r5 + 10 * 16]
24606 pslldq m0, 2
24607 pinsrb m0, [r4 + 21], 1
24608 pinsrb m0, [r4 + 22], 0
24609 pmaddubsw m4, m0, m6
24610 pmulhrsw m4, m7
24611 pslldq m2, 2
24612 pinsrb m2, [r4 + 11], 1
24613 pinsrb m2, [r4 + 12], 0
24614 pmaddubsw m5, m2, m6
24615 pmulhrsw m5, m7
24616 packuswb m4, m5
24617 movu [r0 + 1132 * 16], m4
24618 pslldq m1, 2
24619 pinsrb m1, [r4 + 1], 1
24620 pinsrb m1, [r4 + 2], 0
24621 pmaddubsw m4, m1, m6
24622 pmulhrsw m4, m7
24623 pslldq m3, 2
24624 pinsrw m3, [r3 + 6], 0
24625 pmaddubsw m5, m3, m6
24626 pmulhrsw m5, m7
24627 packuswb m4, m5
24628 movu [r0 + 1133 * 16], m4
24629
24630 ; mode 19 [row 23]
24631 movu m6, [r5 + 16 * 16]
24632 pslldq m0, 2
24633 pinsrb m0, [r4 + 22], 1
24634 pinsrb m0, [r4 + 23], 0
24635 pmaddubsw m4, m0, m6
24636 pmulhrsw m4, m7
24637 pslldq m2, 2
24638 pinsrb m2, [r4 + 12], 1
24639 pinsrb m2, [r4 + 14], 0
24640 pmaddubsw m5, m2, m6
24641 pmulhrsw m5, m7
24642 packuswb m4, m5
24643 movu [r0 + 1134 * 16], m4
24644 pslldq m1, 2
24645 pinsrb m1, [r4 + 2], 1
24646 pinsrb m1, [r4 + 4], 0
24647 pmaddubsw m4, m1, m6
24648 pmulhrsw m4, m7
24649 pslldq m3, 2
24650 pinsrw m3, [r3 + 5], 0
24651 pmaddubsw m5, m3, m6
24652 pmulhrsw m5, m7
24653 packuswb m4, m5
24654 movu [r0 + 1135 * 16], m4
24655
24656 ; mode 19 [row 24]
24657 movu m6, [r5 + 22 * 16]
24658 pslldq m0, 2
24659 pinsrb m0, [r4 + 23], 1
24660 pinsrb m0, [r4 + 25], 0
24661 pmaddubsw m4, m0, m6
24662 pmulhrsw m4, m7
24663 pslldq m2, 2
24664 pinsrb m2, [r4 + 14], 1
24665 pinsrb m2, [r4 + 15], 0
24666 pmaddubsw m5, m2, m6
24667 pmulhrsw m5, m7
24668 packuswb m4, m5
24669 movu [r0 + 1136 * 16], m4
24670 pslldq m1, 2
24671 pinsrb m1, [r4 + 4], 1
24672 pinsrb m1, [r4 + 5], 0
24673 pmaddubsw m4, m1, m6
24674 pmulhrsw m4, m7
24675 pslldq m3, 2
24676 pinsrw m3, [r3 + 4], 0
24677 pmaddubsw m5, m3, m6
24678 pmulhrsw m5, m7
24679 packuswb m4, m5
24680 movu [r0 + 1137 * 16], m4
24681
24682 ; mode 19 [row 25]
24683 movu m6, [r5 + 28 * 16]
24684 pslldq m0, 2
24685 pinsrb m0, [r4 + 25], 1
24686 pinsrb m0, [r4 + 26], 0
24687 pmaddubsw m4, m0, m6
24688 pmulhrsw m4, m7
24689 pslldq m2, 2
24690 pinsrb m2, [r4 + 15], 1
24691 pinsrb m2, [r4 + 16], 0
24692 pmaddubsw m5, m2, m6
24693 pmulhrsw m5, m7
24694 packuswb m4, m5
24695 movu [r0 + 1138 * 16], m4
24696 pslldq m1, 2
24697 pinsrb m1, [r4 + 5], 1
24698 pinsrb m1, [r4 + 6], 0
24699 pmaddubsw m4, m1, m6
24700 pmulhrsw m4, m7
24701 pslldq m3, 2
24702 pinsrw m3, [r3 + 3], 0
24703 pmaddubsw m5, m3, m6
24704 pmulhrsw m5, m7
24705 packuswb m4, m5
24706 movu [r0 + 1139 * 16], m4
24707
24708 ; mode 19 [row 26] - weight index wrapped (28 -> 2): windows reused unshifted
24709 movu m6, [r5 + 2 * 16]
24710 pmaddubsw m4, m0, m6
24711 pmulhrsw m4, m7
24712 pmaddubsw m5, m2, m6
24713 pmulhrsw m5, m7
24714 packuswb m4, m5
24715 movu [r0 + 1140 * 16], m4
24716 pmaddubsw m4, m1, m6
24717 pmulhrsw m4, m7
24718 pmaddubsw m5, m3, m6
24719 pmulhrsw m5, m7
24720 packuswb m4, m5
24721 movu [r0 + 1141 * 16], m4
24722
24723 ; mode 19 [row 27]
24724 movu m6, [r5 + 8 * 16]
24725 pslldq m0, 2
24726 pinsrb m0, [r4 + 26], 1
24727 pinsrb m0, [r4 + 27], 0
24728 pmaddubsw m4, m0, m6
24729 pmulhrsw m4, m7
24730 pslldq m2, 2
24731 pinsrb m2, [r4 + 16], 1
24732 pinsrb m2, [r4 + 17], 0
24733 pmaddubsw m5, m2, m6
24734 pmulhrsw m5, m7
24735 packuswb m4, m5
24736 movu [r0 + 1142 * 16], m4
24737 pslldq m1, 2
24738 pinsrb m1, [r4 + 6], 1
24739 pinsrb m1, [r4 + 7], 0
24740 pmaddubsw m4, m1, m6
24741 pmulhrsw m4, m7
24742 pslldq m3, 2
24743 pinsrw m3, [r3 + 2], 0
24744 pmaddubsw m5, m3, m6
24745 pmulhrsw m5, m7
24746 packuswb m4, m5
24747 movu [r0 + 1143 * 16], m4
24748
24749 ; mode 19 [row 28]
24750 movu m6, [r5 + 14 * 16]
24751 pslldq m0, 2
24752 pinsrb m0, [r4 + 27], 1
24753 pinsrb m0, [r4 + 28], 0
24754 pmaddubsw m4, m0, m6
24755 pmulhrsw m4, m7
24756 pslldq m2, 2
24757 pinsrb m2, [r4 + 17], 1
24758 pinsrb m2, [r4 + 18], 0
24759 pmaddubsw m5, m2, m6
24760 pmulhrsw m5, m7
24761 packuswb m4, m5
24762 movu [r0 + 1144 * 16], m4
24763 pslldq m1, 2
24764 pinsrb m1, [r4 + 7], 1
24765 pinsrb m1, [r4 + 9], 0
24766 pmaddubsw m4, m1, m6
24767 pmulhrsw m4, m7
24768 pslldq m3, 2
24769 pinsrw m3, [r3 + 1], 0
24770 pmaddubsw m5, m3, m6
24771 pmulhrsw m5, m7
24772 packuswb m4, m5
24773 movu [r0 + 1145 * 16], m4
24774
24775 ; mode 19 [row 29]
24776 movu m6, [r5 + 20 * 16]
24777 pslldq m0, 2
24778 pinsrb m0, [r4 + 28], 1
24779 pinsrb m0, [r4 + 30], 0
24780 pmaddubsw m4, m0, m6
24781 pmulhrsw m4, m7
24782 pslldq m2, 2
24783 pinsrb m2, [r4 + 18], 1
24784 pinsrb m2, [r4 + 20], 0
24785 pmaddubsw m5, m2, m6
24786 pmulhrsw m5, m7
24787 packuswb m4, m5
24788 movu [r0 + 1146 * 16], m4
24789 pslldq m1, 2
24790 pinsrb m1, [r4 + 9], 1
24791 pinsrb m1, [r4 + 10], 0
24792 pmaddubsw m4, m1, m6
24793 pmulhrsw m4, m7
24794 pslldq m3, 2
24795 pinsrw m3, [r3 + 0], 0
24796 pmaddubsw m5, m3, m6
24797 pmulhrsw m5, m7
24798 packuswb m4, m5
24799 movu [r0 + 1147 * 16], m4
24800
24801 ; mode 19 [row 30]
24802 movu m6, [r5 + 26 * 16]
24803 pslldq m0, 2
24804 pinsrb m0, [r4 + 30], 1
24805 pinsrb m0, [r4 + 31], 0
24806 pmaddubsw m4, m0, m6
24807 pmulhrsw m4, m7
24808 pslldq m2, 2
24809 pinsrb m2, [r4 + 20], 1
24810 pinsrb m2, [r4 + 21], 0
24811 pmaddubsw m5, m2, m6
24812 pmulhrsw m5, m7
24813 packuswb m4, m5
24814 movu [r0 + 1148 * 16], m4
24815 pslldq m1, 2
24816 pinsrb m1, [r4 + 10], 1
24817 pinsrb m1, [r4 + 11], 0
24818 pmaddubsw m4, m1, m6
24819 pmulhrsw m4, m7
24820 pslldq m3, 2
24821 pinsrb m3, [r4 + 0], 1 ; m3 now starts taking its new pairs from r4
24822 pinsrb m3, [r4 + 1], 0
24823 pmaddubsw m5, m3, m6
24824 pmulhrsw m5, m7
24825 packuswb m4, m5
24826 movu [r0 + 1149 * 16], m4
24827
24828 ; mode 19 [row 31] - no multiply: 8 bytes per window are picked via tab_S2, as in row 15
24829 pshufb m5, m0, [tab_S2]
24830 movh [r0 + 1150 * 16], m5
24831 pshufb m5, m2, [tab_S2]
24832 movh [r0 + 1150 * 16 + 8], m5
24833 pshufb m5, m1, [tab_S2]
24834 movh [r0 + 1151 * 16], m5
24835 pshufb m5, m3, [tab_S2]
24836 movh [r0 + 1151 * 16 + 8], m5
24837
24838 ; mode 20 [row 0]
24839 movu m6, [r5 + 11 * 16]
24840 movu m0, [r3 ]
24841 movu m1, [r3 + 1 ]
24842 punpcklbw m0, m1
24843 pmaddubsw m1, m0, m6
24844 pmulhrsw m1, m7
24845 movu m2, [r3 + 8]
24846 movu m3, [r3 + 9]
24847 punpcklbw m2, m3
24848 pmaddubsw m3, m2, m6
24849 pmulhrsw m3, m7
24850 packuswb m1, m3
24851 movu [r0 + 1152 * 16], m1
24852
24853 movu m1, [r3 + 16]
24854 movu m3, [r3 + 17]
24855 punpcklbw m1, m3
24856 pmaddubsw m4, m1, m6
24857 pmulhrsw m4, m7
24858 movu m3, [r3 + 24]
24859 movu m5, [r3 + 25]
24860 punpcklbw m3, m5
24861 pmaddubsw m5, m3, m6
24862 pmulhrsw m5, m7
24863 packuswb m4, m5
24864 movu [r0 + 1153 * 16], m4
24865
24866 ; mode 20 [row 1]
24867 movu m6, [r5 + 22 * 16]
24868 pslldq m0, 2
24869 pinsrb m0, [r4 + 0], 1
24870 pinsrb m0, [r4 + 2], 0
24871 pmaddubsw m4, m0, m6
24872 pmulhrsw m4, m7
24873 pslldq m2, 2
24874 pinsrw m2, [r3 + 7], 0
24875 pmaddubsw m5, m2, m6
24876 pmulhrsw m5, m7
24877 packuswb m4, m5
24878 movu [r0 + 1154 * 16], m4
24879 pslldq m1, 2
24880 pinsrw m1, [r3 + 15], 0
24881 pmaddubsw m4, m1, m6
24882 pmulhrsw m4, m7
24883 pslldq m3, 2
24884 pinsrw m3, [r3 + 23], 0
24885 pmaddubsw m5, m3, m6
24886 pmulhrsw m5, m7
24887 packuswb m4, m5
24888 movu [r0 + 1155 * 16], m4
24889
24890 ; mode 20 [row 2]
24891 movu m6, [r5 + 1 * 16]
24892 pmaddubsw m4, m0, m6
24893 pmulhrsw m4, m7
24894 pmaddubsw m5, m2, m6
24895 pmulhrsw m5, m7
24896 packuswb m4, m5
24897 movu [r0 + 1156 * 16], m4
24898 pmaddubsw m4, m1, m6
24899 pmulhrsw m4, m7
24900 pmaddubsw m5, m3, m6
24901 pmulhrsw m5, m7
24902 packuswb m4, m5
24903 movu [r0 + 1157 * 16], m4
24904
24905 ; mode 20 [row 3]
24906 movu m6, [r5 + 12 * 16]
24907 pslldq m0, 2
24908 pinsrb m0, [r4 + 2], 1
24909 pinsrb m0, [r4 + 3], 0
24910 pmaddubsw m4, m0, m6
24911 pmulhrsw m4, m7
24912 pslldq m2, 2
24913 pinsrw m2, [r3 + 6], 0
24914 pmaddubsw m5, m2, m6
24915 pmulhrsw m5, m7
24916 packuswb m4, m5
24917 movu [r0 + 1158 * 16], m4
24918 pslldq m1, 2
24919 pinsrw m1, [r3 + 14], 0
24920 pmaddubsw m4, m1, m6
24921 pmulhrsw m4, m7
24922 pslldq m3, 2
24923 pinsrw m3, [r3 + 22], 0
24924 pmaddubsw m5, m3, m6
24925 pmulhrsw m5, m7
24926 packuswb m4, m5
24927 movu [r0 + 1159 * 16], m4
24928
24929 ; mode 20 [row 4]
24930 movu m6, [r5 + 23 * 16]
24931 pslldq m0, 2
24932 pinsrb m0, [r4 + 3], 1
24933 pinsrb m0, [r4 + 5], 0
24934 pmaddubsw m4, m0, m6
24935 pmulhrsw m4, m7
24936 pslldq m2, 2
24937 pinsrw m2, [r3 + 5], 0
24938 pmaddubsw m5, m2, m6
24939 pmulhrsw m5, m7
24940 packuswb m4, m5
24941 movu [r0 + 1160 * 16], m4
24942 pslldq m1, 2
24943 pinsrw m1, [r3 + 13], 0
24944 pmaddubsw m4, m1, m6
24945 pmulhrsw m4, m7
24946 pslldq m3, 2
24947 pinsrw m3, [r3 + 21], 0
24948 pmaddubsw m5, m3, m6
24949 pmulhrsw m5, m7
24950 packuswb m4, m5
24951 movu [r0 + 1161 * 16], m4
24952
24953 ; mode 20 [row 5]
24954 movu m6, [r5 + 2 * 16]
24955 pmaddubsw m4, m0, m6
24956 pmulhrsw m4, m7
24957 pmaddubsw m5, m2, m6
24958 pmulhrsw m5, m7
24959 packuswb m4, m5
24960 movu [r0 + 1162 * 16], m4
24961 pmaddubsw m4, m1, m6
24962 pmulhrsw m4, m7
24963 pmaddubsw m5, m3, m6
24964 pmulhrsw m5, m7
24965 packuswb m4, m5
24966 movu [r0 + 1163 * 16], m4
24967
24968 ; mode 20 [row 6]
24969 movu m6, [r5 + 13 * 16]
24970 pslldq m0, 2
24971 pinsrb m0, [r4 + 5], 1
24972 pinsrb m0, [r4 + 6], 0
24973 pmaddubsw m4, m0, m6
24974 pmulhrsw m4, m7
24975 pslldq m2, 2
24976 pinsrw m2, [r3 + 4], 0
24977 pmaddubsw m5, m2, m6
24978 pmulhrsw m5, m7
24979 packuswb m4, m5
24980 movu [r0 + 1164 * 16], m4
24981 pslldq m1, 2
24982 pinsrw m1, [r3 + 12], 0
24983 pmaddubsw m4, m1, m6
24984 pmulhrsw m4, m7
24985 pslldq m3, 2
24986 pinsrw m3, [r3 + 20], 0
24987 pmaddubsw m5, m3, m6
24988 pmulhrsw m5, m7
24989 packuswb m4, m5
24990 movu [r0 + 1165 * 16], m4
24991
24992 ; mode 20 [row 7]
24993 movu m6, [r5 + 24 * 16]
24994 pslldq m0, 2
24995 pinsrb m0, [r4 + 6], 1
24996 pinsrb m0, [r4 + 8], 0
24997 pmaddubsw m4, m0, m6
24998 pmulhrsw m4, m7
24999 pslldq m2, 2
25000 pinsrw m2, [r3 + 3], 0
25001 pmaddubsw m5, m2, m6
25002 pmulhrsw m5, m7
25003 packuswb m4, m5
25004 movu [r0 + 1166 * 16], m4
25005 pslldq m1, 2
25006 pinsrw m1, [r3 + 11], 0
25007 pmaddubsw m4, m1, m6
25008 pmulhrsw m4, m7
25009 pslldq m3, 2
25010 pinsrw m3, [r3 + 19], 0
25011 pmaddubsw m5, m3, m6
25012 pmulhrsw m5, m7
25013 packuswb m4, m5
25014 movu [r0 + 1167 * 16], m4
25015
25016 ; mode 20 [row 8]
25017 movu m6, [r5 + 3 * 16]
25018 pmaddubsw m4, m0, m6
25019 pmulhrsw m4, m7
25020 pmaddubsw m5, m2, m6
25021 pmulhrsw m5, m7
25022 packuswb m4, m5
25023 movu [r0 + 1168 * 16], m4
25024 pmaddubsw m4, m1, m6
25025 pmulhrsw m4, m7
25026 pmaddubsw m5, m3, m6
25027 pmulhrsw m5, m7
25028 packuswb m4, m5
25029 movu [r0 + 1169 * 16], m4
25030
25031 ; mode 20 [row 9]
25032 movu m6, [r5 + 14 * 16]
25033 pslldq m0, 2
25034 pinsrb m0, [r4 + 8], 1
25035 pinsrb m0, [r4 + 9], 0
25036 pmaddubsw m4, m0, m6
25037 pmulhrsw m4, m7
25038 pslldq m2, 2
25039 pinsrb m2, [r3 + 3], 1
25040 pinsrb m2, [r3 + 2], 0
25041 pmaddubsw m5, m2, m6
25042 pmulhrsw m5, m7
25043 packuswb m4, m5
25044 movu [r0 + 1170 * 16], m4
25045 pslldq m1, 2
25046 pinsrw m1, [r3 + 10], 0
25047 pmaddubsw m4, m1, m6
25048 pmulhrsw m4, m7
25049 pslldq m3, 2
25050 pinsrw m3, [r3 + 18], 0
25051 pmaddubsw m5, m3, m6
25052 pmulhrsw m5, m7
25053 packuswb m4, m5
25054 movu [r0 + 1171 * 16], m4
25055
25056 ; mode 20 [row 10]
25057 movu m6, [r5 + 25 * 16]
25058 pslldq m0, 2
25059 pinsrb m0, [r4 + 9], 1
25060 pinsrb m0, [r4 + 11], 0
25061 pmaddubsw m4, m0, m6
25062 pmulhrsw m4, m7
25063 pslldq m2, 2
25064 pinsrw m2, [r3 + 1], 0
25065 pmaddubsw m5, m2, m6
25066 pmulhrsw m5, m7
25067 packuswb m4, m5
25068 movu [r0 + 1172 * 16], m4
25069 pslldq m1, 2
25070 pinsrw m1, [r3 + 9], 0
25071 pmaddubsw m4, m1, m6
25072 pmulhrsw m4, m7
25073 pslldq m3, 2
25074 pinsrw m3, [r3 + 17], 0
25075 pmaddubsw m5, m3, m6
25076 pmulhrsw m5, m7
25077 packuswb m4, m5
25078 movu [r0 + 1173 * 16], m4
25079
25080 ; mode 20 [row 11]
25081 movu m6, [r5 + 4 * 16]
25082 pmaddubsw m4, m0, m6
25083 pmulhrsw m4, m7
25084 pmaddubsw m5, m2, m6
25085 pmulhrsw m5, m7
25086 packuswb m4, m5
25087 movu [r0 + 1174 * 16], m4
25088 pmaddubsw m4, m1, m6
25089 pmulhrsw m4, m7
25090 pmaddubsw m5, m3, m6
25091 pmulhrsw m5, m7
25092 packuswb m4, m5
25093 movu [r0 + 1175 * 16], m4
25094
25095 ; mode 20 [row 12]
25096 movu m6, [r5 + 15 * 16]
25097 pslldq m0, 2
25098 pinsrb m0, [r4 + 11], 1
25099 pinsrb m0, [r4 + 12], 0
25100 pmaddubsw m4, m0, m6
25101 pmulhrsw m4, m7
25102 pslldq m2, 2
25103 pinsrb m2, [r3 + 1], 1
25104 pinsrb m2, [r3 + 0], 0
25105 pmaddubsw m5, m2, m6
25106 pmulhrsw m5, m7
25107 packuswb m4, m5
25108 movu [r0 + 1176 * 16], m4
25109 pslldq m1, 2
25110 pinsrw m1, [r3 + 8], 0
25111 pmaddubsw m4, m1, m6
25112 pmulhrsw m4, m7
25113 pslldq m3, 2
25114 pinsrw m3, [r3 + 16], 0
25115 pmaddubsw m5, m3, m6
25116 pmulhrsw m5, m7
25117 packuswb m4, m5
25118 movu [r0 + 1177 * 16], m4
25119
25120 ; mode 20 [row 13]
25121 movu m6, [r5 + 26 * 16]
25122 pslldq m0, 2
25123 pinsrb m0, [r4 + 12], 1
25124 pinsrb m0, [r4 + 14], 0
25125 pmaddubsw m4, m0, m6
25126 pmulhrsw m4, m7
25127 pslldq m2, 2
25128 pinsrb m2, [r4 + 0], 1
25129 pinsrb m2, [r4 + 2], 0
25130 pmaddubsw m5, m2, m6
25131 pmulhrsw m5, m7
25132 packuswb m4, m5
25133 movu [r0 + 1178 * 16], m4
25134 pslldq m1, 2
25135 pinsrw m1, [r3 + 7], 0
25136 pmaddubsw m4, m1, m6
25137 pmulhrsw m4, m7
25138 pslldq m3, 2
25139 pinsrw m3, [r3 + 15], 0
25140 pmaddubsw m5, m3, m6
25141 pmulhrsw m5, m7
25142 packuswb m4, m5
25143 movu [r0 + 1179 * 16], m4
25144
25145 ; mode 20 [row 14]
25146 movu m6, [r5 + 5 * 16]
25147 pmaddubsw m4, m0, m6
25148 pmulhrsw m4, m7
25149 pmaddubsw m5, m2, m6
25150 pmulhrsw m5, m7
25151 packuswb m4, m5
25152 movu [r0 + 1180 * 16], m4
25153 pmaddubsw m4, m1, m6
25154 pmulhrsw m4, m7
25155 pmaddubsw m5, m3, m6
25156 pmulhrsw m5, m7
25157 packuswb m4, m5
25158 movu [r0 + 1181 * 16], m4
25159
25160 ; mode 20 [row 15]
25161 movu m6, [r5 + 16 * 16]
25162 pslldq m0, 2
25163 pinsrb m0, [r4 + 14], 1
25164 pinsrb m0, [r4 + 15], 0
25165 pmaddubsw m4, m0, m6
25166 pmulhrsw m4, m7
25167 pslldq m2, 2
25168 pinsrb m2, [r4 + 2], 1
25169 pinsrb m2, [r4 + 3], 0
25170 pmaddubsw m5, m2, m6
25171 pmulhrsw m5, m7
25172 packuswb m4, m5
25173 movu [r0 + 1182 * 16], m4
25174 pslldq m1, 2
25175 pinsrw m1, [r3 + 6], 0
25176 pmaddubsw m4, m1, m6
25177 pmulhrsw m4, m7
25178 pslldq m3, 2
25179 pinsrw m3, [r3 + 14], 0
25180 pmaddubsw m5, m3, m6
25181 pmulhrsw m5, m7
25182 packuswb m4, m5
25183 movu [r0 + 1183 * 16], m4
25184
25185 ; mode 20 [row 16]
25186 movu m6, [r5 + 27 * 16]
25187 pslldq m0, 2
25188 pinsrb m0, [r4 + 15], 1
25189 pinsrb m0, [r4 + 17], 0
25190 pmaddubsw m4, m0, m6
25191 pmulhrsw m4, m7
25192 pslldq m2, 2
25193 pinsrb m2, [r4 + 3], 1
25194 pinsrb m2, [r4 + 5], 0
25195 pmaddubsw m5, m2, m6
25196 pmulhrsw m5, m7
25197 packuswb m4, m5
25198 movu [r0 + 1184 * 16], m4
25199 pslldq m1, 2
25200 pinsrw m1, [r3 + 5], 0
25201 pmaddubsw m4, m1, m6
25202 pmulhrsw m4, m7
25203 pslldq m3, 2
25204 pinsrw m3, [r3 + 13], 0
25205 pmaddubsw m5, m3, m6
25206 pmulhrsw m5, m7
25207 packuswb m4, m5
25208 movu [r0 + 1185 * 16], m4
25209
25210 ; mode 20 [row 17]
25211 movu m6, [r5 + 6 * 16]
25212 pmaddubsw m4, m0, m6
25213 pmulhrsw m4, m7
25214 pmaddubsw m5, m2, m6
25215 pmulhrsw m5, m7
25216 packuswb m4, m5
25217 movu [r0 + 1186 * 16], m4
25218 pmaddubsw m4, m1, m6
25219 pmulhrsw m4, m7
25220 pmaddubsw m5, m3, m6
25221 pmulhrsw m5, m7
25222 packuswb m4, m5
25223 movu [r0 + 1187 * 16], m4
25224
25225 ; mode 20 [row 18]
25226 movu m6, [r5 + 17 * 16]
25227 pslldq m0, 2
25228 pinsrb m0, [r4 + 17], 1
25229 pinsrb m0, [r4 + 18], 0
25230 pmaddubsw m4, m0, m6
25231 pmulhrsw m4, m7
25232 pslldq m2, 2
25233 pinsrb m2, [r4 + 5], 1
25234 pinsrb m2, [r4 + 6], 0
25235 pmaddubsw m5, m2, m6
25236 pmulhrsw m5, m7
25237 packuswb m4, m5
25238 movu [r0 + 1188 * 16], m4
25239 pslldq m1, 2
25240 pinsrw m1, [r3 + 4], 0
25241 pmaddubsw m4, m1, m6
25242 pmulhrsw m4, m7
25243 pslldq m3, 2
25244 pinsrw m3, [r3 + 12], 0
25245 pmaddubsw m5, m3, m6
25246 pmulhrsw m5, m7
25247 packuswb m4, m5
25248 movu [r0 + 1189 * 16], m4
25249
25250 ; mode 20 [row 19]
25251 movu m6, [r5 + 28 * 16]
25252 pslldq m0, 2
25253 pinsrb m0, [r4 + 18], 1
25254 pinsrb m0, [r4 + 20], 0
25255 pmaddubsw m4, m0, m6
25256 pmulhrsw m4, m7
25257 pslldq m2, 2
25258 pinsrb m2, [r4 + 6], 1
25259 pinsrb m2, [r4 + 8], 0
25260 pmaddubsw m5, m2, m6
25261 pmulhrsw m5, m7
25262 packuswb m4, m5
25263 movu [r0 + 1190 * 16], m4
25264 pslldq m1, 2
25265 pinsrw m1, [r3 + 3], 0
25266 pmaddubsw m4, m1, m6
25267 pmulhrsw m4, m7
25268 pslldq m3, 2
25269 pinsrw m3, [r3 + 11], 0
25270 pmaddubsw m5, m3, m6
25271 pmulhrsw m5, m7
25272 packuswb m4, m5
25273 movu [r0 + 1191 * 16], m4
25274
25275 ; mode 20 [row 20]
25276 movu m6, [r5 + 7 * 16]
25277 pmaddubsw m4, m0, m6
25278 pmulhrsw m4, m7
25279 pmaddubsw m5, m2, m6
25280 pmulhrsw m5, m7
25281 packuswb m4, m5
25282 movu [r0 + 1192 * 16], m4
25283 pmaddubsw m4, m1, m6
25284 pmulhrsw m4, m7
25285 pmaddubsw m5, m3, m6
25286 pmulhrsw m5, m7
25287 packuswb m4, m5
25288 movu [r0 + 1193 * 16], m4
25289
25290 ; mode 20 [row 21]
25291 movu m6, [r5 + 18 * 16]
25292 pslldq m0, 2
25293 pinsrb m0, [r4 + 20], 1
25294 pinsrb m0, [r4 + 21], 0
25295 pmaddubsw m4, m0, m6
25296 pmulhrsw m4, m7
25297 pslldq m2, 2
25298 pinsrb m2, [r4 + 8], 1
25299 pinsrb m2, [r4 + 9], 0
25300 pmaddubsw m5, m2, m6
25301 pmulhrsw m5, m7
25302 packuswb m4, m5
25303 movu [r0 + 1194 * 16], m4
25304 pslldq m1, 2
25305 pinsrw m1, [r3 + 2], 0
25306 pmaddubsw m4, m1, m6
25307 pmulhrsw m4, m7
25308 pslldq m3, 2
25309 pinsrw m3, [r3 + 10], 0
25310 pmaddubsw m5, m3, m6
25311 pmulhrsw m5, m7
25312 packuswb m4, m5
25313 movu [r0 + 1195 * 16], m4
25314
25315 ; mode 20 [row 22]
25316 movu m6, [r5 + 29 * 16]
25317 pslldq m0, 2
25318 pinsrb m0, [r4 + 21], 1
25319 pinsrb m0, [r4 + 23], 0
25320 pmaddubsw m4, m0, m6
25321 pmulhrsw m4, m7
25322 pslldq m2, 2
25323 pinsrb m2, [r4 + 9], 1
25324 pinsrb m2, [r4 + 11], 0
25325 pmaddubsw m5, m2, m6
25326 pmulhrsw m5, m7
25327 packuswb m4, m5
25328 movu [r0 + 1196 * 16], m4
25329 pslldq m1, 2
25330 pinsrw m1, [r3 + 1], 0
25331 pmaddubsw m4, m1, m6
25332 pmulhrsw m4, m7
25333 pslldq m3, 2
25334 pinsrw m3, [r3 + 9], 0
25335 pmaddubsw m5, m3, m6
25336 pmulhrsw m5, m7
25337 packuswb m4, m5
25338 movu [r0 + 1197 * 16], m4
25339
25340 ; mode 20 [row 23]
25341 movu m6, [r5 + 8 * 16]
25342 pmaddubsw m4, m0, m6
25343 pmulhrsw m4, m7
25344 pmaddubsw m5, m2, m6
25345 pmulhrsw m5, m7
25346 packuswb m4, m5
25347 movu [r0 + 1198 * 16], m4
25348 pmaddubsw m4, m1, m6
25349 pmulhrsw m4, m7
25350 pmaddubsw m5, m3, m6
25351 pmulhrsw m5, m7
25352 packuswb m4, m5
25353 movu [r0 + 1199 * 16], m4
25354
25355 ; mode 20 [row 24]
25356 movu m6, [r5 + 19 * 16]
25357 pslldq m0, 2
25358 pinsrb m0, [r4 + 23], 1
25359 pinsrb m0, [r4 + 24], 0
25360 pmaddubsw m4, m0, m6
25361 pmulhrsw m4, m7
25362 pslldq m2, 2
25363 pinsrb m2, [r4 + 11], 1
25364 pinsrb m2, [r4 + 12], 0
25365 pmaddubsw m5, m2, m6
25366 pmulhrsw m5, m7
25367 packuswb m4, m5
25368 movu [r0 + 1200 * 16], m4
25369 pslldq m1, 2
25370 pinsrw m1, [r3 + 0], 0
25371 pmaddubsw m4, m1, m6
25372 pmulhrsw m4, m7
25373 pslldq m3, 2
25374 pinsrw m3, [r3 + 8], 0
25375 pmaddubsw m5, m3, m6
25376 pmulhrsw m5, m7
25377 packuswb m4, m5
25378 movu [r0 + 1201 * 16], m4
25379
25380 ; mode 20 [row 25]
25381 movu m6, [r5 + 30 * 16]
25382 pslldq m0, 2
25383 pinsrb m0, [r4 + 24], 1
25384 pinsrb m0, [r4 + 26], 0
25385 pmaddubsw m4, m0, m6
25386 pmulhrsw m4, m7
25387 pslldq m2, 2
25388 pinsrb m2, [r4 + 12], 1
25389 pinsrb m2, [r4 + 14], 0
25390 pmaddubsw m5, m2, m6
25391 pmulhrsw m5, m7
25392 packuswb m4, m5
25393 movu [r0 + 1202 * 16], m4
25394 pslldq m1, 2
25395 pinsrb m1, [r4 + 0], 1
25396 pinsrb m1, [r4 + 2], 0
25397 pmaddubsw m4, m1, m6
25398 pmulhrsw m4, m7
25399 pslldq m3, 2
25400 pinsrw m3, [r3 + 7], 0
25401 pmaddubsw m5, m3, m6
25402 pmulhrsw m5, m7
25403 packuswb m4, m5
25404 movu [r0 + 1203 * 16], m4
25405
25406 ; mode 20 [row 26]
25407 movu m6, [r5 + 9 * 16]
25408 pmaddubsw m4, m0, m6
25409 pmulhrsw m4, m7
25410 pmaddubsw m5, m2, m6
25411 pmulhrsw m5, m7
25412 packuswb m4, m5
25413 movu [r0 + 1204 * 16], m4
25414 pmaddubsw m4, m1, m6
25415 pmulhrsw m4, m7
25416 pmaddubsw m5, m3, m6
25417 pmulhrsw m5, m7
25418 packuswb m4, m5
25419 movu [r0 + 1205 * 16], m4
25420
25421 ; mode 20 [row 27]
25422 movu m6, [r5 + 20 * 16]
25423 pslldq m0, 2
25424 pinsrb m0, [r4 + 26], 1
25425 pinsrb m0, [r4 + 27], 0
25426 pmaddubsw m4, m0, m6
25427 pmulhrsw m4, m7
25428 pslldq m2, 2
25429 pinsrb m2, [r4 + 14], 1
25430 pinsrb m2, [r4 + 15], 0
25431 pmaddubsw m5, m2, m6
25432 pmulhrsw m5, m7
25433 packuswb m4, m5
25434 movu [r0 + 1206 * 16], m4
25435 pslldq m1, 2
25436 pinsrb m1, [r4 + 2], 1
25437 pinsrb m1, [r4 + 3], 0
25438 pmaddubsw m4, m1, m6
25439 pmulhrsw m4, m7
25440 pslldq m3, 2
25441 pinsrw m3, [r3 + 6], 0
25442 pmaddubsw m5, m3, m6
25443 pmulhrsw m5, m7
25444 packuswb m4, m5
25445 movu [r0 + 1207 * 16], m4
25446
25447 ; mode 20 [row 28]
25448 movu m6, [r5 + 31 * 16]
25449 pslldq m0, 2
25450 pinsrb m0, [r4 + 27], 1
25451 pinsrb m0, [r4 + 29], 0
25452 pmaddubsw m4, m0, m6
25453 pmulhrsw m4, m7
25454 pslldq m2, 2
25455 pinsrb m2, [r4 + 15], 1
25456 pinsrb m2, [r4 + 17], 0
25457 pmaddubsw m5, m2, m6
25458 pmulhrsw m5, m7
25459 packuswb m4, m5
25460 movu [r0 + 1208 * 16], m4
25461 pslldq m1, 2
25462 pinsrb m1, [r4 + 3], 1
25463 pinsrb m1, [r4 + 5], 0
25464 pmaddubsw m4, m1, m6
25465 pmulhrsw m4, m7
25466 pslldq m3, 2
25467 pinsrw m3, [r3 + 5], 0
25468 pmaddubsw m5, m3, m6
25469 pmulhrsw m5, m7
25470 packuswb m4, m5
25471 movu [r0 + 1209 * 16], m4
25472
25473 ; mode 20 [row 29]
25474 movu m6, [r5 + 10 * 16]
25475 pmaddubsw m4, m0, m6
25476 pmulhrsw m4, m7
25477 pmaddubsw m5, m2, m6
25478 pmulhrsw m5, m7
25479 packuswb m4, m5
25480 movu [r0 + 1210 * 16], m4
25481 pmaddubsw m4, m1, m6
25482 pmulhrsw m4, m7
25483 pmaddubsw m5, m3, m6
25484 pmulhrsw m5, m7
25485 packuswb m4, m5
25486 movu [r0 + 1211 * 16], m4
25487
25488 ; mode 20 [row 30]
25489 movu m6, [r5 + 21 * 16]
25490 pslldq m0, 2
25491 pinsrb m0, [r4 + 29], 1
25492 pinsrb m0, [r4 + 30], 0
25493 pmaddubsw m4, m0, m6
25494 pmulhrsw m4, m7
25495 pslldq m2, 2
25496 pinsrb m2, [r4 + 17], 1
25497 pinsrb m2, [r4 + 18], 0
25498 pmaddubsw m5, m2, m6
25499 pmulhrsw m5, m7
25500 packuswb m4, m5
25501 movu [r0 + 1212 * 16], m4
25502 pslldq m1, 2
25503 pinsrb m1, [r4 + 5], 1
25504 pinsrb m1, [r4 + 6], 0
25505 pmaddubsw m4, m1, m6
25506 pmulhrsw m4, m7
25507 pslldq m3, 2
25508 pinsrw m3, [r3 + 4], 0
25509 pmaddubsw m5, m3, m6
25510 pmulhrsw m5, m7
25511 packuswb m4, m5
25512 movu [r0 + 1213 * 16], m4
25513
25514 ; mode 20 [row 31]: no pmaddubsw/pmulhrsw for this row - each sample register is only permuted through tab_S2 and its low half stored
25515 pshufb m5, m0, [tab_S2] ; NOTE(review): tab_S2 is defined outside this chunk; presumably it keeps one byte of every interleaved pair - confirm
25516 movh [r0 + 1214 * 16], m5 ; first 8-byte slot of the row
25517 pshufb m5, m2, [tab_S2]
25518 movh [r0 + 1214 * 16 + 8], m5 ; second 8-byte slot
25519 pshufb m5, m1, [tab_S2]
25520 movh [r0 + 1215 * 16], m5 ; third 8-byte slot
25521 pshufb m5, m3, [tab_S2]
25522 movh [r0 + 1215 * 16 + 8], m5 ; fourth 8-byte slot, completing the 32-byte row
25523
25524 ; mode 21 [row 0]: build the four interleaved sample registers from r3 and emit the first 32-byte row of this mode
25525 movu m6, [r5 + 15 * 16] ; m6 = 16-byte entry 15 of the table at r5 (r5 is set up outside this chunk; presumably per-fraction weight pairs - confirm)
25526 movu m0, [r3 ] ; 16 bytes starting at r3
25527 movu m1, [r3 + 1 ] ; the same run, one byte further on
25528 punpcklbw m0, m1 ; m0 = byte pairs (r3[i], r3[i+1]) for i = 0..7
25529 pmaddubsw m1, m0, m6 ; unsigned bytes of m0 times signed bytes of m6, adjacent products summed into 8 words
25530 pmulhrsw m1, m7 ; scale by m7 (loaded outside this chunk; presumably the rounding constant - confirm)
25531 movu m2, [r3 + 8]
25532 movu m3, [r3 + 9]
25533 punpcklbw m2, m3 ; m2 = byte pairs (r3[i], r3[i+1]) for i = 8..15
25534 pmaddubsw m3, m2, m6
25535 pmulhrsw m3, m7
25536 packuswb m1, m3 ; 16 words -> 16 bytes with unsigned saturation
25537 movu [r0 + 1216 * 16], m1 ; bytes 0-15 of this row
25538
25539 movu m1, [r3 + 16]
25540 movu m3, [r3 + 17]
25541 punpcklbw m1, m3 ; m1 = byte pairs for i = 16..23
25542 pmaddubsw m4, m1, m6 ; from here on results go to m4/m5 so m0-m3 keep the raw pairs
25543 pmulhrsw m4, m7
25544 movu m3, [r3 + 24]
25545 movu m5, [r3 + 25]
25546 punpcklbw m3, m5 ; m3 = byte pairs for i = 24..31
25547 pmaddubsw m5, m3, m6
25548 pmulhrsw m5, m7
25549 packuswb m4, m5
25550 movu [r0 + 1217 * 16], m4 ; bytes 16-31 of this row; m0/m2/m1/m3 stay live and are shifted/refilled by the following rows
25551
25552 ; mode 21 [row 1]
25553 movu m6, [r5 + 30 * 16]
25554 pslldq m0, 2
25555 pinsrb m0, [r4 + 0], 1
25556 pinsrb m0, [r4 + 2], 0
25557 pmaddubsw m4, m0, m6
25558 pmulhrsw m4, m7
25559 pslldq m2, 2
25560 pinsrw m2, [r3 + 7], 0
25561 pmaddubsw m5, m2, m6
25562 pmulhrsw m5, m7
25563 packuswb m4, m5
25564 movu [r0 + 1218 * 16], m4
25565 pslldq m1, 2
25566 pinsrw m1, [r3 + 15], 0
25567 pmaddubsw m4, m1, m6
25568 pmulhrsw m4, m7
25569 pslldq m3, 2
25570 pinsrw m3, [r3 + 23], 0
25571 pmaddubsw m5, m3, m6
25572 pmulhrsw m5, m7
25573 packuswb m4, m5
25574 movu [r0 + 1219 * 16], m4
25575
25576 ; mode 21 [row 2]
25577 movu m6, [r5 + 13 * 16]
25578 pmaddubsw m4, m0, m6
25579 pmulhrsw m4, m7
25580 pmaddubsw m5, m2, m6
25581 pmulhrsw m5, m7
25582 packuswb m4, m5
25583 movu [r0 + 1220 * 16], m4
25584 pmaddubsw m4, m1, m6
25585 pmulhrsw m4, m7
25586 pmaddubsw m5, m3, m6
25587 pmulhrsw m5, m7
25588 packuswb m4, m5
25589 movu [r0 + 1221 * 16], m4
25590
25591 ; mode 21 [row 3]
25592 movu m6, [r5 + 28 * 16]
25593 pslldq m0, 2
25594 pinsrb m0, [r4 + 2], 1
25595 pinsrb m0, [r4 + 4], 0
25596 pmaddubsw m4, m0, m6
25597 pmulhrsw m4, m7
25598 pslldq m2, 2
25599 pinsrw m2, [r3 + 6], 0
25600 pmaddubsw m5, m2, m6
25601 pmulhrsw m5, m7
25602 packuswb m4, m5
25603 movu [r0 + 1222 * 16], m4
25604 pslldq m1, 2
25605 pinsrw m1, [r3 + 14], 0
25606 pmaddubsw m4, m1, m6
25607 pmulhrsw m4, m7
25608 pslldq m3, 2
25609 pinsrw m3, [r3 + 22], 0
25610 pmaddubsw m5, m3, m6
25611 pmulhrsw m5, m7
25612 packuswb m4, m5
25613 movu [r0 + 1223 * 16], m4
25614
25615 ; mode 21 [row 4]
25616 movu m6, [r5 + 11 * 16]
25617 pmaddubsw m4, m0, m6
25618 pmulhrsw m4, m7
25619 pmaddubsw m5, m2, m6
25620 pmulhrsw m5, m7
25621 packuswb m4, m5
25622 movu [r0 + 1224 * 16], m4
25623 pmaddubsw m4, m1, m6
25624 pmulhrsw m4, m7
25625 pmaddubsw m5, m3, m6
25626 pmulhrsw m5, m7
25627 packuswb m4, m5
25628 movu [r0 + 1225 * 16], m4
25629
25630 ; mode 21 [row 5]
25631 movu m6, [r5 + 26 * 16]
25632 pslldq m0, 2
25633 pinsrb m0, [r4 + 4], 1
25634 pinsrb m0, [r4 + 6], 0
25635 pmaddubsw m4, m0, m6
25636 pmulhrsw m4, m7
25637 pslldq m2, 2
25638 pinsrw m2, [r3 + 5], 0
25639 pmaddubsw m5, m2, m6
25640 pmulhrsw m5, m7
25641 packuswb m4, m5
25642 movu [r0 + 1226 * 16], m4
25643 pslldq m1, 2
25644 pinsrw m1, [r3 + 13], 0
25645 pmaddubsw m4, m1, m6
25646 pmulhrsw m4, m7
25647 pslldq m3, 2
25648 pinsrw m3, [r3 + 21], 0
25649 pmaddubsw m5, m3, m6
25650 pmulhrsw m5, m7
25651 packuswb m4, m5
25652 movu [r0 + 1227 * 16], m4
25653
25654 ; mode 21 [row 6]
25655 movu m6, [r5 + 9 * 16]
25656 pmaddubsw m4, m0, m6
25657 pmulhrsw m4, m7
25658 pmaddubsw m5, m2, m6
25659 pmulhrsw m5, m7
25660 packuswb m4, m5
25661 movu [r0 + 1228 * 16], m4
25662 pmaddubsw m4, m1, m6
25663 pmulhrsw m4, m7
25664 pmaddubsw m5, m3, m6
25665 pmulhrsw m5, m7
25666 packuswb m4, m5
25667 movu [r0 + 1229 * 16], m4
25668
25669 ; mode 21 [row 7]
25670 movu m6, [r5 + 24 * 16]
25671 pslldq m0, 2
25672 pinsrb m0, [r4 + 6], 1
25673 pinsrb m0, [r4 + 8], 0
25674 pmaddubsw m4, m0, m6
25675 pmulhrsw m4, m7
25676 pslldq m2, 2
25677 pinsrw m2, [r3 + 4], 0
25678 pmaddubsw m5, m2, m6
25679 pmulhrsw m5, m7
25680 packuswb m4, m5
25681 movu [r0 + 1230 * 16], m4
25682 pslldq m1, 2
25683 pinsrw m1, [r3 + 12], 0
25684 pmaddubsw m4, m1, m6
25685 pmulhrsw m4, m7
25686 pslldq m3, 2
25687 pinsrw m3, [r3 + 20], 0
25688 pmaddubsw m5, m3, m6
25689 pmulhrsw m5, m7
25690 packuswb m4, m5
25691 movu [r0 + 1231 * 16], m4
25692
25693 ; mode 21 [row 8]
25694 movu m6, [r5 + 7 * 16]
25695 pmaddubsw m4, m0, m6
25696 pmulhrsw m4, m7
25697 pmaddubsw m5, m2, m6
25698 pmulhrsw m5, m7
25699 packuswb m4, m5
25700 movu [r0 + 1232 * 16], m4
25701 pmaddubsw m4, m1, m6
25702 pmulhrsw m4, m7
25703 pmaddubsw m5, m3, m6
25704 pmulhrsw m5, m7
25705 packuswb m4, m5
25706 movu [r0 + 1233 * 16], m4
25707
25708 ; mode 21 [row 9]
25709 movu m6, [r5 + 22 * 16]
25710 pslldq m0, 2
25711 pinsrb m0, [r4 + 8], 1
25712 pinsrb m0, [r4 + 9], 0
25713 pmaddubsw m4, m0, m6
25714 pmulhrsw m4, m7
25715 pslldq m2, 2
25716 pinsrw m2, [r3 + 3], 0
25717 pmaddubsw m5, m2, m6
25718 pmulhrsw m5, m7
25719 packuswb m4, m5
25720 movu [r0 + 1234 * 16], m4
25721 pslldq m1, 2
25722 pinsrw m1, [r3 + 11], 0
25723 pmaddubsw m4, m1, m6
25724 pmulhrsw m4, m7
25725 pslldq m3, 2
25726 pinsrw m3, [r3 + 19], 0
25727 pmaddubsw m5, m3, m6
25728 pmulhrsw m5, m7
25729 packuswb m4, m5
25730 movu [r0 + 1235 * 16], m4
25731
25732 ; mode 21 [row 10]
25733 movu m6, [r5 + 5 * 16]
25734 pmaddubsw m4, m0, m6
25735 pmulhrsw m4, m7
25736 pmaddubsw m5, m2, m6
25737 pmulhrsw m5, m7
25738 packuswb m4, m5
25739 movu [r0 + 1236 * 16], m4
25740 pmaddubsw m4, m1, m6
25741 pmulhrsw m4, m7
25742 pmaddubsw m5, m3, m6
25743 pmulhrsw m5, m7
25744 packuswb m4, m5
25745 movu [r0 + 1237 * 16], m4
25746
25747 ; mode 21 [row 11]
25748 movu m6, [r5 + 20 * 16]
25749 pslldq m0, 2
25750 pinsrb m0, [r4 + 9], 1
25751 pinsrb m0, [r4 + 11], 0
25752 pmaddubsw m4, m0, m6
25753 pmulhrsw m4, m7
25754 pslldq m2, 2
25755 pinsrw m2, [r3 + 2], 0
25756 pmaddubsw m5, m2, m6
25757 pmulhrsw m5, m7
25758 packuswb m4, m5
25759 movu [r0 + 1238 * 16], m4
25760 pslldq m1, 2
25761 pinsrw m1, [r3 + 10], 0
25762 pmaddubsw m4, m1, m6
25763 pmulhrsw m4, m7
25764 pslldq m3, 2
25765 pinsrw m3, [r3 + 18], 0
25766 pmaddubsw m5, m3, m6
25767 pmulhrsw m5, m7
25768 packuswb m4, m5
25769 movu [r0 + 1239 * 16], m4
25770
25771 ; mode 21 [row 12]
25772 movu m6, [r5 + 3 * 16]
25773 pmaddubsw m4, m0, m6
25774 pmulhrsw m4, m7
25775 pmaddubsw m5, m2, m6
25776 pmulhrsw m5, m7
25777 packuswb m4, m5
25778 movu [r0 + 1240 * 16], m4
25779 pmaddubsw m4, m1, m6
25780 pmulhrsw m4, m7
25781 pmaddubsw m5, m3, m6
25782 pmulhrsw m5, m7
25783 packuswb m4, m5
25784 movu [r0 + 1241 * 16], m4
25785
25786 ; mode 21 [row 13]
25787 movu m6, [r5 + 18 * 16]
25788 pslldq m0, 2
25789 pinsrb m0, [r4 + 11], 1
25790 pinsrb m0, [r4 + 13], 0
25791 pmaddubsw m4, m0, m6
25792 pmulhrsw m4, m7
25793 pslldq m2, 2
25794 pinsrw m2, [r3 + 1], 0
25795 pmaddubsw m5, m2, m6
25796 pmulhrsw m5, m7
25797 packuswb m4, m5
25798 movu [r0 + 1242 * 16], m4
25799 pslldq m1, 2
25800 pinsrw m1, [r3 + 9], 0
25801 pmaddubsw m4, m1, m6
25802 pmulhrsw m4, m7
25803 pslldq m3, 2
25804 pinsrw m3, [r3 + 17], 0
25805 pmaddubsw m5, m3, m6
25806 pmulhrsw m5, m7
25807 packuswb m4, m5
25808 movu [r0 + 1243 * 16], m4
25809
25810 ; mode 21 [row 14]
25811 movu m6, [r5 + 1 * 16]
25812 pmaddubsw m4, m0, m6
25813 pmulhrsw m4, m7
25814 pmaddubsw m5, m2, m6
25815 pmulhrsw m5, m7
25816 packuswb m4, m5
25817 movu [r0 + 1244 * 16], m4
25818 pmaddubsw m4, m1, m6
25819 pmulhrsw m4, m7
25820 pmaddubsw m5, m3, m6
25821 pmulhrsw m5, m7
25822 packuswb m4, m5
25823 movu [r0 + 1245 * 16], m4
25824
25825 ; mode 21 [row 15]
25826 movu m6, [r5 + 16 * 16]
25827 pslldq m0, 2
25828 pinsrb m0, [r4 + 13], 1
25829 pinsrb m0, [r4 + 15], 0
25830 pmaddubsw m4, m0, m6
25831 pmulhrsw m4, m7
25832 pslldq m2, 2
25833 pinsrw m2, [r3 + 0], 0
25834 pmaddubsw m5, m2, m6
25835 pmulhrsw m5, m7
25836 packuswb m4, m5
25837 movu [r0 + 1246 * 16], m4
25838 pslldq m1, 2
25839 pinsrw m1, [r3 + 8], 0
25840 pmaddubsw m4, m1, m6
25841 pmulhrsw m4, m7
25842 pslldq m3, 2
25843 pinsrw m3, [r3 + 16], 0
25844 pmaddubsw m5, m3, m6
25845 pmulhrsw m5, m7
25846 packuswb m4, m5
25847 movu [r0 + 1247 * 16], m4
25848
25849 ; mode 21 [row 16]
25850 movu m6, [r5 + 31 * 16]
25851 pslldq m0, 2
25852 pinsrb m0, [r4 + 15], 1
25853 pinsrb m0, [r4 + 17], 0
25854 pmaddubsw m4, m0, m6
25855 pmulhrsw m4, m7
25856 pslldq m2, 2
25857 pinsrb m2, [r4 + 0], 1
25858 pinsrb m2, [r4 + 2], 0
25859 pmaddubsw m5, m2, m6
25860 pmulhrsw m5, m7
25861 packuswb m4, m5
25862 movu [r0 + 1248 * 16], m4
25863 pslldq m1, 2
25864 pinsrw m1, [r3 + 7], 0
25865 pmaddubsw m4, m1, m6
25866 pmulhrsw m4, m7
25867 pslldq m3, 2
25868 pinsrw m3, [r3 + 15], 0
25869 pmaddubsw m5, m3, m6
25870 pmulhrsw m5, m7
25871 packuswb m4, m5
25872 movu [r0 + 1249 * 16], m4
25873
25874 ; mode 21 [row 17]
25875 movu m6, [r5 + 14 * 16]
25876 pmaddubsw m4, m0, m6
25877 pmulhrsw m4, m7
25878 pmaddubsw m5, m2, m6
25879 pmulhrsw m5, m7
25880 packuswb m4, m5
25881 movu [r0 + 1250 * 16], m4
25882 pmaddubsw m4, m1, m6
25883 pmulhrsw m4, m7
25884 pmaddubsw m5, m3, m6
25885 pmulhrsw m5, m7
25886 packuswb m4, m5
25887 movu [r0 + 1251 * 16], m4
25888
25889 ; mode 21 [row 18]: shift one new sample pair into each of m0/m2/m1/m3, then weight with table entry 29
25890 movu m6, [r5 + 29 * 16] ; weights for this row (table base r5 is set up outside this chunk)
25891 pslldq m0, 2 ; open bytes 0-1 for the new pair
25892 pinsrb m0, [r4 + 17], 1
25893 pinsrb m0, [r4 + 19], 0 ; the two r4 bytes are not adjacent, so two byte inserts are needed
25894 pmaddubsw m4, m0, m6
25895 pmulhrsw m4, m7
25896 pslldq m2, 2
25897 pinsrb m2, [r4 + 2], 1
25898 pinsrb m2, [r4 + 4], 0
25899 pmaddubsw m5, m2, m6
25900 pmulhrsw m5, m7
25901 packuswb m4, m5
25902 movu [r0 + 1252 * 16], m4 ; bytes 0-15 of this row
25903 pslldq m1, 2
25904 ; r3[6] and r3[7] are adjacent, so a single word insert does the job (byte 0 = r3[6], byte 1 = r3[7]), as in rows 16 and 20
25905 pinsrw m1, [r3 + 6], 0
25906 pmaddubsw m4, m1, m6
25907 pmulhrsw m4, m7
25908 pslldq m3, 2
25909 ; same for the last eight columns: byte 0 = r3[14], byte 1 = r3[15]
25910 pinsrw m3, [r3 + 14], 0
25911 pmaddubsw m5, m3, m6
25912 pmulhrsw m5, m7
25913 packuswb m4, m5
25914 movu [r0 + 1253 * 16], m4 ; bytes 16-31 of this row
25915
25916 ; mode 21 [row 19]
25917 movu m6, [r5 + 12 * 16]
25918 pmaddubsw m4, m0, m6
25919 pmulhrsw m4, m7
25920 pmaddubsw m5, m2, m6
25921 pmulhrsw m5, m7
25922 packuswb m4, m5
25923 movu [r0 + 1254 * 16], m4
25924 pmaddubsw m4, m1, m6
25925 pmulhrsw m4, m7
25926 pmaddubsw m5, m3, m6
25927 pmulhrsw m5, m7
25928 packuswb m4, m5
25929 movu [r0 + 1255 * 16], m4
25930
25931 ; mode 21 [row 20]
25932 movu m6, [r5 + 27 * 16]
25933 pslldq m0, 2
25934 pinsrb m0, [r4 + 19], 1
25935 pinsrb m0, [r4 + 21], 0
25936 pmaddubsw m4, m0, m6
25937 pmulhrsw m4, m7
25938 pslldq m2, 2
25939 pinsrb m2, [r4 + 4], 1
25940 pinsrb m2, [r4 + 6], 0
25941 pmaddubsw m5, m2, m6
25942 pmulhrsw m5, m7
25943 packuswb m4, m5
25944 movu [r0 + 1256 * 16], m4
25945 pslldq m1, 2
25946 pinsrw m1, [r3 + 5], 0
25947 pmaddubsw m4, m1, m6
25948 pmulhrsw m4, m7
25949 pslldq m3, 2
25950 pinsrw m3, [r3 + 13], 0
25951 pmaddubsw m5, m3, m6
25952 pmulhrsw m5, m7
25953 packuswb m4, m5
25954 movu [r0 + 1257 * 16], m4
25955
25956 ; mode 21 [row 21]
25957 movu m6, [r5 + 10 * 16]
25958 pmaddubsw m4, m0, m6
25959 pmulhrsw m4, m7
25960 pmaddubsw m5, m2, m6
25961 pmulhrsw m5, m7
25962 packuswb m4, m5
25963 movu [r0 + 1258 * 16], m4
25964 pmaddubsw m4, m1, m6
25965 pmulhrsw m4, m7
25966 pmaddubsw m5, m3, m6
25967 pmulhrsw m5, m7
25968 packuswb m4, m5
25969 movu [r0 + 1259 * 16], m4
25970
25971 ; mode 21 [row 22]
25972 movu m6, [r5 + 25 * 16]
25973 pslldq m0, 2
25974 pinsrb m0, [r4 + 21], 1
25975 pinsrb m0, [r4 + 23], 0
25976 pmaddubsw m4, m0, m6
25977 pmulhrsw m4, m7
25978 pslldq m2, 2
25979 pinsrb m2, [r4 + 6], 1
25980 pinsrb m2, [r4 + 8], 0
25981 pmaddubsw m5, m2, m6
25982 pmulhrsw m5, m7
25983 packuswb m4, m5
25984 movu [r0 + 1260 * 16], m4
25985 pslldq m1, 2
25986 pinsrw m1, [r3 + 4], 0
25987 pmaddubsw m4, m1, m6
25988 pmulhrsw m4, m7
25989 pslldq m3, 2
25990 pinsrw m3, [r3 + 12], 0
25991 pmaddubsw m5, m3, m6
25992 pmulhrsw m5, m7
25993 packuswb m4, m5
25994 movu [r0 + 1261 * 16], m4
25995
25996 ; mode 21 [row 23]
25997 movu m6, [r5 + 8 * 16]
25998 pmaddubsw m4, m0, m6
25999 pmulhrsw m4, m7
26000 pmaddubsw m5, m2, m6
26001 pmulhrsw m5, m7
26002 packuswb m4, m5
26003 movu [r0 + 1262 * 16], m4
26004 pmaddubsw m4, m1, m6
26005 pmulhrsw m4, m7
26006 pmaddubsw m5, m3, m6
26007 pmulhrsw m5, m7
26008 packuswb m4, m5
26009 movu [r0 + 1263 * 16], m4
26010
26011 ; mode 21 [row 24]
26012 movu m6, [r5 + 23 * 16]
26013 pslldq m0, 2
26014 pinsrb m0, [r4 + 23], 1
26015 pinsrb m0, [r4 + 24], 0
26016 pmaddubsw m4, m0, m6
26017 pmulhrsw m4, m7
26018 pslldq m2, 2
26019 pinsrb m2, [r4 + 8], 1
26020 pinsrb m2, [r4 + 9], 0
26021 pmaddubsw m5, m2, m6
26022 pmulhrsw m5, m7
26023 packuswb m4, m5
26024 movu [r0 + 1264 * 16], m4
26025 pslldq m1, 2
26026 pinsrw m1, [r3 + 3], 0
26027 pmaddubsw m4, m1, m6
26028 pmulhrsw m4, m7
26029 pslldq m3, 2
26030 pinsrw m3, [r3 + 11], 0
26031 pmaddubsw m5, m3, m6
26032 pmulhrsw m5, m7
26033 packuswb m4, m5
26034 movu [r0 + 1265 * 16], m4
26035
26036 ; mode 21 [row 25]
26037 movu m6, [r5 + 6 * 16]
26038 pmaddubsw m4, m0, m6
26039 pmulhrsw m4, m7
26040 pmaddubsw m5, m2, m6
26041 pmulhrsw m5, m7
26042 packuswb m4, m5
26043 movu [r0 + 1266 * 16], m4
26044 pmaddubsw m4, m1, m6
26045 pmulhrsw m4, m7
26046 pmaddubsw m5, m3, m6
26047 pmulhrsw m5, m7
26048 packuswb m4, m5
26049 movu [r0 + 1267 * 16], m4
26050
26051 ; mode 21 [row 26]
26052 movu m6, [r5 + 21 * 16]
26053 pslldq m0, 2
26054 pinsrb m0, [r4 + 24], 1
26055 pinsrb m0, [r4 + 26], 0
26056 pmaddubsw m4, m0, m6
26057 pmulhrsw m4, m7
26058 pslldq m2, 2
26059 pinsrb m2, [r4 + 9], 1
26060 pinsrb m2, [r4 + 11], 0
26061 pmaddubsw m5, m2, m6
26062 pmulhrsw m5, m7
26063 packuswb m4, m5
26064 movu [r0 + 1268 * 16], m4
26065 pslldq m1, 2
26066 pinsrw m1, [r3 + 2], 0
26067 pmaddubsw m4, m1, m6
26068 pmulhrsw m4, m7
26069 pslldq m3, 2
26070 pinsrw m3, [r3 + 10], 0
26071 pmaddubsw m5, m3, m6
26072 pmulhrsw m5, m7
26073 packuswb m4, m5
26074 movu [r0 + 1269 * 16], m4
26075
26076 ; mode 21 [row 27]
26077 movu m6, [r5 + 4 * 16]
26078 pmaddubsw m4, m0, m6
26079 pmulhrsw m4, m7
26080 pmaddubsw m5, m2, m6
26081 pmulhrsw m5, m7
26082 packuswb m4, m5
26083 movu [r0 + 1270 * 16], m4
26084 pmaddubsw m4, m1, m6
26085 pmulhrsw m4, m7
26086 pmaddubsw m5, m3, m6
26087 pmulhrsw m5, m7
26088 packuswb m4, m5
26089 movu [r0 + 1271 * 16], m4
26090
26091 ; mode 21 [row 28]
26092 movu m6, [r5 + 19 * 16]
26093 pslldq m0, 2
26094 pinsrb m0, [r4 + 26], 1
26095 pinsrb m0, [r4 + 28], 0
26096 pmaddubsw m4, m0, m6
26097 pmulhrsw m4, m7
26098 pslldq m2, 2
26099 pinsrb m2, [r4 + 11], 1
26100 pinsrb m2, [r4 + 13], 0
26101 pmaddubsw m5, m2, m6
26102 pmulhrsw m5, m7
26103 packuswb m4, m5
26104 movu [r0 + 1272 * 16], m4
26105 pslldq m1, 2
26106 pinsrw m1, [r3 + 1], 0
26107 pmaddubsw m4, m1, m6
26108 pmulhrsw m4, m7
26109 pslldq m3, 2
26110 pinsrw m3, [r3 + 9], 0
26111 pmaddubsw m5, m3, m6
26112 pmulhrsw m5, m7
26113 packuswb m4, m5
26114 movu [r0 + 1273 * 16], m4
26115
26116 ; mode 21 [row 29]
26117 movu m6, [r5 + 2 * 16]
26118 pmaddubsw m4, m0, m6
26119 pmulhrsw m4, m7
26120 pmaddubsw m5, m2, m6
26121 pmulhrsw m5, m7
26122 packuswb m4, m5
26123 movu [r0 + 1274 * 16], m4
26124 pmaddubsw m4, m1, m6
26125 pmulhrsw m4, m7
26126 pmaddubsw m5, m3, m6
26127 pmulhrsw m5, m7
26128 packuswb m4, m5
26129 movu [r0 + 1275 * 16], m4
26130
26131 ; mode 21 [row 30]
26132 movu m6, [r5 + 17 * 16]
26133 pslldq m0, 2
26134 pinsrb m0, [r4 + 28], 1
26135 pinsrb m0, [r4 + 30], 0
26136 pmaddubsw m4, m0, m6
26137 pmulhrsw m4, m7
26138 pslldq m2, 2
26139 pinsrb m2, [r4 + 13], 1
26140 pinsrb m2, [r4 + 15], 0
26141 pmaddubsw m5, m2, m6
26142 pmulhrsw m5, m7
26143 packuswb m4, m5
26144 movu [r0 + 1276 * 16], m4
26145 pslldq m1, 2
26146 pinsrw m1, [r3 + 0], 0
26147 pmaddubsw m4, m1, m6
26148 pmulhrsw m4, m7
26149 pslldq m3, 2
26150 pinsrw m3, [r3 + 8], 0
26151 pmaddubsw m5, m3, m6
26152 pmulhrsw m5, m7
26153 packuswb m4, m5
26154 movu [r0 + 1277 * 16], m4
26155
26156 ; mode 21 [row 31]: shuffle-only row - no table entry is loaded and nothing is multiplied; m0/m2/m1/m3 are permuted through tab_S2 and stored in 8-byte pieces
26157 pshufb m5, m0, [tab_S2] ; NOTE(review): tab_S2 lives outside this chunk; presumably selects one byte per interleaved pair - confirm
26158 movh [r0 + 1278 * 16], m5 ; row offset +0
26159 pshufb m5, m2, [tab_S2]
26160 movh [r0 + 1278 * 16 + 8], m5 ; row offset +8
26161 pshufb m5, m1, [tab_S2]
26162 movh [r0 + 1279 * 16], m5 ; row offset +16
26163 pshufb m5, m3, [tab_S2]
26164 movh [r0 + 1279 * 16 + 8], m5 ; row offset +24, last piece of the 32-byte row
26165
26166 ; mode 22 [row 0]: reload the interleaved sample pairs from r3 and produce the first 32-byte row of this mode
26167 movu m6, [r5 + 19 * 16] ; row weights = 16-byte entry 19 of the table at r5 (r5 comes from outside this chunk; table layout not visible here)
26168 movu m0, [r3 ]
26169 movu m1, [r3 + 1 ]
26170 punpcklbw m0, m1 ; m0 = (r3[i], r3[i+1]) pairs, i = 0..7
26171 pmaddubsw m1, m0, m6 ; multiply each pair by the matching m6 bytes and add -> 8 words in m1
26172 pmulhrsw m1, m7 ; m7 is loaded before this chunk; presumably the rounding/scale constant - confirm
26173 movu m2, [r3 + 8]
26174 movu m3, [r3 + 9]
26175 punpcklbw m2, m3 ; m2 = pairs for i = 8..15
26176 pmaddubsw m3, m2, m6
26177 pmulhrsw m3, m7
26178 packuswb m1, m3 ; saturate the 16 words down to 16 unsigned bytes
26179 movu [r0 + 1280 * 16], m1 ; first 16 bytes of the row
26180
26181 movu m1, [r3 + 16]
26182 movu m3, [r3 + 17]
26183 punpcklbw m1, m3 ; m1 = pairs for i = 16..23
26184 pmaddubsw m4, m1, m6 ; m4/m5 take the products so m1/m3 keep the raw pairs
26185 pmulhrsw m4, m7
26186 movu m3, [r3 + 24]
26187 movu m5, [r3 + 25]
26188 punpcklbw m3, m5 ; m3 = pairs for i = 24..31
26189 pmaddubsw m5, m3, m6
26190 pmulhrsw m5, m7
26191 packuswb m4, m5
26192 movu [r0 + 1281 * 16], m4 ; second 16 bytes of the row; m0/m2/m1/m3 are reused by the rows that follow
26193
26194 ; mode 22 [row 1]
26195 movu m6, [r5 + 6 * 16]
26196 pmaddubsw m4, m0, m6
26197 pmulhrsw m4, m7
26198 pmaddubsw m5, m2, m6
26199 pmulhrsw m5, m7
26200 packuswb m4, m5
26201 movu [r0 + 1282 * 16], m4
26202 pmaddubsw m4, m1, m6
26203 pmulhrsw m4, m7
26204 pmaddubsw m5, m3, m6
26205 pmulhrsw m5, m7
26206 packuswb m4, m5
26207 movu [r0 + 1283 * 16], m4
26208
26209 ; mode 22 [row 2]
26210 movu m6, [r5 + 25 * 16]
26211 pslldq m0, 2
26212 pinsrb m0, [r4 + 0], 1
26213 pinsrb m0, [r4 + 2], 0
26214 pmaddubsw m4, m0, m6
26215 pmulhrsw m4, m7
26216 pslldq m2, 2
26217 pinsrw m2, [r3 + 7], 0
26218 pmaddubsw m5, m2, m6
26219 pmulhrsw m5, m7
26220 packuswb m4, m5
26221 movu [r0 + 1284 * 16], m4
26222 pslldq m1, 2
26223 pinsrw m1, [r3 + 15], 0
26224 pmaddubsw m4, m1, m6
26225 pmulhrsw m4, m7
26226 pslldq m3, 2
26227 pinsrw m3, [r3 + 23], 0
26228 pmaddubsw m5, m3, m6
26229 pmulhrsw m5, m7
26230 packuswb m4, m5
26231 movu [r0 + 1285 * 16], m4
26232
26233 ; mode 22 [row 3]
26234 movu m6, [r5 + 12 * 16]
26235 pmaddubsw m4, m0, m6
26236 pmulhrsw m4, m7
26237 pmaddubsw m5, m2, m6
26238 pmulhrsw m5, m7
26239 packuswb m4, m5
26240 movu [r0 + 1286 * 16], m4
26241 pmaddubsw m4, m1, m6
26242 pmulhrsw m4, m7
26243 pmaddubsw m5, m3, m6
26244 pmulhrsw m5, m7
26245 packuswb m4, m5
26246 movu [r0 + 1287 * 16], m4
26247
26248 ; mode 22 [row 4]
26249 movu m6, [r5 + 31 * 16]
26250 pslldq m0, 2
26251 pinsrb m0, [r4 + 2], 1
26252 pinsrb m0, [r4 + 5], 0
26253 pmaddubsw m4, m0, m6
26254 pmulhrsw m4, m7
26255 pslldq m2, 2
26256 pinsrw m2, [r3 + 6], 0
26257 pmaddubsw m5, m2, m6
26258 pmulhrsw m5, m7
26259 packuswb m4, m5
26260 movu [r0 + 1288 * 16], m4
26261 pslldq m1, 2
26262 pinsrw m1, [r3 + 14], 0
26263 pmaddubsw m4, m1, m6
26264 pmulhrsw m4, m7
26265 pslldq m3, 2
26266 pinsrw m3, [r3 + 22], 0
26267 pmaddubsw m5, m3, m6
26268 pmulhrsw m5, m7
26269 packuswb m4, m5
26270 movu [r0 + 1289 * 16], m4
26271
26272 ; mode 22 [row 5]
26273 movu m6, [r5 + 18 * 16]
26274 pmaddubsw m4, m0, m6
26275 pmulhrsw m4, m7
26276 pmaddubsw m5, m2, m6
26277 pmulhrsw m5, m7
26278 packuswb m4, m5
26279 movu [r0 + 1290 * 16], m4
26280 pmaddubsw m4, m1, m6
26281 pmulhrsw m4, m7
26282 pmaddubsw m5, m3, m6
26283 pmulhrsw m5, m7
26284 packuswb m4, m5
26285 movu [r0 + 1291 * 16], m4
26286
26287 ; mode 22 [row 6]
26288 movu m6, [r5 + 5 * 16]
26289 pmaddubsw m4, m0, m6
26290 pmulhrsw m4, m7
26291 pmaddubsw m5, m2, m6
26292 pmulhrsw m5, m7
26293 packuswb m4, m5
26294 movu [r0 + 1292 * 16], m4
26295 pmaddubsw m4, m1, m6
26296 pmulhrsw m4, m7
26297 pmaddubsw m5, m3, m6
26298 pmulhrsw m5, m7
26299 packuswb m4, m5
26300 movu [r0 + 1293 * 16], m4
26301
26302 ; mode 22 [row 7]
26303 movu m6, [r5 + 24 * 16]
26304 pslldq m0, 2
26305 pinsrb m0, [r4 + 5], 1
26306 pinsrb m0, [r4 + 7], 0
26307 pmaddubsw m4, m0, m6
26308 pmulhrsw m4, m7
26309 pslldq m2, 2
26310 pinsrw m2, [r3 + 5], 0
26311 pmaddubsw m5, m2, m6
26312 pmulhrsw m5, m7
26313 packuswb m4, m5
26314 movu [r0 + 1294 * 16], m4
26315 pslldq m1, 2
26316 pinsrw m1, [r3 + 13], 0
26317 pmaddubsw m4, m1, m6
26318 pmulhrsw m4, m7
26319 pslldq m3, 2
26320 pinsrw m3, [r3 + 21], 0
26321 pmaddubsw m5, m3, m6
26322 pmulhrsw m5, m7
26323 packuswb m4, m5
26324 movu [r0 + 1295 * 16], m4
26325
26326 ; mode 22 [row 8]
26327 movu m6, [r5 + 11 * 16]
26328 pmaddubsw m4, m0, m6
26329 pmulhrsw m4, m7
26330 pmaddubsw m5, m2, m6
26331 pmulhrsw m5, m7
26332 packuswb m4, m5
26333 movu [r0 + 1296 * 16], m4
26334 pmaddubsw m4, m1, m6
26335 pmulhrsw m4, m7
26336 pmaddubsw m5, m3, m6
26337 pmulhrsw m5, m7
26338 packuswb m4, m5
26339 movu [r0 + 1297 * 16], m4
26340
26341 ; mode 22 [row 9]
26342 movu m6, [r5 + 30 * 16]
26343 pslldq m0, 2
26344 pinsrb m0, [r4 + 7], 1
26345 pinsrb m0, [r4 + 10], 0
26346 pmaddubsw m4, m0, m6
26347 pmulhrsw m4, m7
26348 pslldq m2, 2
26349 pinsrw m2, [r3 + 4], 0
26350 pmaddubsw m5, m2, m6
26351 pmulhrsw m5, m7
26352 packuswb m4, m5
26353 movu [r0 + 1298 * 16], m4
26354 pslldq m1, 2
26355 pinsrw m1, [r3 + 12], 0
26356 pmaddubsw m4, m1, m6
26357 pmulhrsw m4, m7
26358 pslldq m3, 2
26359 pinsrw m3, [r3 + 20], 0
26360 pmaddubsw m5, m3, m6
26361 pmulhrsw m5, m7
26362 packuswb m4, m5
26363 movu [r0 + 1299 * 16], m4
26364
26365 ; mode 22 [row 10]
26366 movu m6, [r5 + 17 * 16]
26367 pmaddubsw m4, m0, m6
26368 pmulhrsw m4, m7
26369 pmaddubsw m5, m2, m6
26370 pmulhrsw m5, m7
26371 packuswb m4, m5
26372 movu [r0 + 1300 * 16], m4
26373 pmaddubsw m4, m1, m6
26374 pmulhrsw m4, m7
26375 pmaddubsw m5, m3, m6
26376 pmulhrsw m5, m7
26377 packuswb m4, m5
26378 movu [r0 + 1301 * 16], m4
26379
26380 ; mode 22 [row 11]
26381 movu m6, [r5 + 4 * 16]
26382 pmaddubsw m4, m0, m6
26383 pmulhrsw m4, m7
26384 pmaddubsw m5, m2, m6
26385 pmulhrsw m5, m7
26386 packuswb m4, m5
26387 movu [r0 + 1302 * 16], m4
26388 pmaddubsw m4, m1, m6
26389 pmulhrsw m4, m7
26390 pmaddubsw m5, m3, m6
26391 pmulhrsw m5, m7
26392 packuswb m4, m5
26393 movu [r0 + 1303 * 16], m4
26394
26395 ; mode 22 [row 12]
26396 movu m6, [r5 + 23 * 16]
26397 pslldq m0, 2
26398 pinsrb m0, [r4 + 10], 1
26399 pinsrb m0, [r4 + 12], 0
26400 pmaddubsw m4, m0, m6
26401 pmulhrsw m4, m7
26402 pslldq m2, 2
26403 pinsrw m2, [r3 + 3], 0
26404 pmaddubsw m5, m2, m6
26405 pmulhrsw m5, m7
26406 packuswb m4, m5
26407 movu [r0 + 1304 * 16], m4
26408 pslldq m1, 2
26409 pinsrw m1, [r3 + 11], 0
26410 pmaddubsw m4, m1, m6
26411 pmulhrsw m4, m7
26412 pslldq m3, 2
26413 pinsrw m3, [r3 + 19], 0
26414 pmaddubsw m5, m3, m6
26415 pmulhrsw m5, m7
26416 packuswb m4, m5
26417 movu [r0 + 1305 * 16], m4
26418
26419 ; mode 22 [row 13]
26420 movu m6, [r5 + 10 * 16]
26421 pmaddubsw m4, m0, m6
26422 pmulhrsw m4, m7
26423 pmaddubsw m5, m2, m6
26424 pmulhrsw m5, m7
26425 packuswb m4, m5
26426 movu [r0 + 1306 * 16], m4
26427 pmaddubsw m4, m1, m6
26428 pmulhrsw m4, m7
26429 pmaddubsw m5, m3, m6
26430 pmulhrsw m5, m7
26431 packuswb m4, m5
26432 movu [r0 + 1307 * 16], m4
26433
26434 ; mode 22 [row 14]
26435 movu m6, [r5 + 29 * 16]
26436 pslldq m0, 2
26437 pinsrb m0, [r4 + 12], 1
26438 pinsrb m0, [r4 + 15], 0
26439 pmaddubsw m4, m0, m6
26440 pmulhrsw m4, m7
26441 pslldq m2, 2
26442 pinsrw m2, [r3 + 2], 0
26443 pmaddubsw m5, m2, m6
26444 pmulhrsw m5, m7
26445 packuswb m4, m5
26446 movu [r0 + 1308 * 16], m4
26447 pslldq m1, 2
26448 pinsrw m1, [r3 + 10], 0
26449 pmaddubsw m4, m1, m6
26450 pmulhrsw m4, m7
26451 pslldq m3, 2
26452 pinsrw m3, [r3 + 18], 0
26453 pmaddubsw m5, m3, m6
26454 pmulhrsw m5, m7
26455 packuswb m4, m5
26456 movu [r0 + 1309 * 16], m4
26457
26458 ; mode 22 [row 15]
26459 movu m6, [r5 + 16 * 16]
26460 pmaddubsw m4, m0, m6
26461 pmulhrsw m4, m7
26462 pmaddubsw m5, m2, m6
26463 pmulhrsw m5, m7
26464 packuswb m4, m5
26465 movu [r0 + 1310 * 16], m4
26466 pmaddubsw m4, m1, m6
26467 pmulhrsw m4, m7
26468 pmaddubsw m5, m3, m6
26469 pmulhrsw m5, m7
26470 packuswb m4, m5
26471 movu [r0 + 1311 * 16], m4
26472
26473 ; mode 22 [row 16]
26474 movu m6, [r5 + 3 * 16]
26475 pmaddubsw m4, m0, m6
26476 pmulhrsw m4, m7
26477 pmaddubsw m5, m2, m6
26478 pmulhrsw m5, m7
26479 packuswb m4, m5
26480 movu [r0 + 1312 * 16], m4
26481 pmaddubsw m4, m1, m6
26482 pmulhrsw m4, m7
26483 pmaddubsw m5, m3, m6
26484 pmulhrsw m5, m7
26485 packuswb m4, m5
26486 movu [r0 + 1313 * 16], m4
26487
26488 ; mode 22 [row 17]
26489 movu m6, [r5 + 22 * 16]
26490 pslldq m0, 2
26491 pinsrb m0, [r4 + 15], 1
26492 pinsrb m0, [r4 + 17], 0
26493 pmaddubsw m4, m0, m6
26494 pmulhrsw m4, m7
26495 pslldq m2, 2
26496 pinsrw m2, [r3 + 1], 0
26497 pmaddubsw m5, m2, m6
26498 pmulhrsw m5, m7
26499 packuswb m4, m5
26500 movu [r0 + 1314 * 16], m4
26501 pslldq m1, 2
26502 pinsrw m1, [r3 + 9], 0
26503 pmaddubsw m4, m1, m6
26504 pmulhrsw m4, m7
26505 pslldq m3, 2
26506 pinsrw m3, [r3 + 17], 0
26507 pmaddubsw m5, m3, m6
26508 pmulhrsw m5, m7
26509 packuswb m4, m5
26510 movu [r0 + 1315 * 16], m4
26511
26512 ; mode 22 [row 18]
26513 movu m6, [r5 + 9 * 16]
26514 pmaddubsw m4, m0, m6
26515 pmulhrsw m4, m7
26516 pmaddubsw m5, m2, m6
26517 pmulhrsw m5, m7
26518 packuswb m4, m5
26519 movu [r0 + 1316 * 16], m4
26520 pmaddubsw m4, m1, m6
26521 pmulhrsw m4, m7
26522 pmaddubsw m5, m3, m6
26523 pmulhrsw m5, m7
26524 packuswb m4, m5
26525 movu [r0 + 1317 * 16], m4
26526
26527 ; mode 22 [row 19]
26528 movu m6, [r5 + 28 * 16]
26529 pslldq m0, 2
26530 pinsrb m0, [r4 + 17], 1
26531 pinsrb m0, [r4 + 20], 0
26532 pmaddubsw m4, m0, m6
26533 pmulhrsw m4, m7
26534 pslldq m2, 2
26535 pinsrw m2, [r3 + 0], 0
26536 pmaddubsw m5, m2, m6
26537 pmulhrsw m5, m7
26538 packuswb m4, m5
26539 movu [r0 + 1318 * 16], m4
26540 pslldq m1, 2
26541 pinsrw m1, [r3 + 8], 0
26542 pmaddubsw m4, m1, m6
26543 pmulhrsw m4, m7
26544 pslldq m3, 2
26545 pinsrw m3, [r3 + 16], 0
26546 pmaddubsw m5, m3, m6
26547 pmulhrsw m5, m7
26548 packuswb m4, m5
26549 movu [r0 + 1319 * 16], m4
26550
26551 ; mode 22 [row 20]
26552 movu m6, [r5 + 15 * 16]
26553 pmaddubsw m4, m0, m6
26554 pmulhrsw m4, m7
26555 pmaddubsw m5, m2, m6
26556 pmulhrsw m5, m7
26557 packuswb m4, m5
26558 movu [r0 + 1320 * 16], m4
26559 pmaddubsw m4, m1, m6
26560 pmulhrsw m4, m7
26561 pmaddubsw m5, m3, m6
26562 pmulhrsw m5, m7
26563 packuswb m4, m5
26564 movu [r0 + 1321 * 16], m4
26565
26566 ; mode 22 [row 21]
26567 movu m6, [r5 + 2 * 16]
26568 pmaddubsw m4, m0, m6
26569 pmulhrsw m4, m7
26570 pmaddubsw m5, m2, m6
26571 pmulhrsw m5, m7
26572 packuswb m4, m5
26573 movu [r0 + 1322 * 16], m4
26574 pmaddubsw m4, m1, m6
26575 pmulhrsw m4, m7
26576 pmaddubsw m5, m3, m6
26577 pmulhrsw m5, m7
26578 packuswb m4, m5
26579 movu [r0 + 1323 * 16], m4
26580
26581 ; mode 22 [row 22]
26582 movu m6, [r5 + 21 * 16]
26583 pslldq m0, 2
26584 pinsrb m0, [r4 + 20], 1
26585 pinsrb m0, [r4 + 22], 0
26586 pmaddubsw m4, m0, m6
26587 pmulhrsw m4, m7
26588 pslldq m2, 2
26589 pinsrb m2, [r4 + 0], 1
26590 pinsrb m2, [r4 + 2], 0
26591 pmaddubsw m5, m2, m6
26592 pmulhrsw m5, m7
26593 packuswb m4, m5
26594 movu [r0 + 1324 * 16], m4
26595 pslldq m1, 2
26596 pinsrw m1, [r3 + 7], 0
26597 pmaddubsw m4, m1, m6
26598 pmulhrsw m4, m7
26599 pslldq m3, 2
26600 pinsrw m3, [r3 + 15], 0
26601 pmaddubsw m5, m3, m6
26602 pmulhrsw m5, m7
26603 packuswb m4, m5
26604 movu [r0 + 1325 * 16], m4
26605
26606 ; mode 22 [row 23]
26607 movu m6, [r5 + 8 * 16]
26608 pmaddubsw m4, m0, m6
26609 pmulhrsw m4, m7
26610 pmaddubsw m5, m2, m6
26611 pmulhrsw m5, m7
26612 packuswb m4, m5
26613 movu [r0 + 1326 * 16], m4
26614 pmaddubsw m4, m1, m6
26615 pmulhrsw m4, m7
26616 pmaddubsw m5, m3, m6
26617 pmulhrsw m5, m7
26618 packuswb m4, m5
26619 movu [r0 + 1327 * 16], m4
26620
26621 ; mode 22 [row 24]
26622 movu m6, [r5 + 27 * 16]
26623 pslldq m0, 2
26624 pinsrb m0, [r4 + 22], 1
26625 pinsrb m0, [r4 + 25], 0
26626 pmaddubsw m4, m0, m6
26627 pmulhrsw m4, m7
26628 pslldq m2, 2
26629 pinsrb m2, [r4 + 2], 1
26630 pinsrb m2, [r4 + 5], 0
26631 pmaddubsw m5, m2, m6
26632 pmulhrsw m5, m7
26633 packuswb m4, m5
26634 movu [r0 + 1328 * 16], m4
26635 pslldq m1, 2
26636 pinsrw m1, [r3 + 6], 0
26637 pmaddubsw m4, m1, m6
26638 pmulhrsw m4, m7
26639 pslldq m3, 2
26640 pinsrw m3, [r3 + 14], 0
26641 pmaddubsw m5, m3, m6
26642 pmulhrsw m5, m7
26643 packuswb m4, m5
26644 movu [r0 + 1329 * 16], m4
26645
26646 ; mode 22 [row 25]
26647 movu m6, [r5 + 14 * 16]
26648 pmaddubsw m4, m0, m6
26649 pmulhrsw m4, m7
26650 pmaddubsw m5, m2, m6
26651 pmulhrsw m5, m7
26652 packuswb m4, m5
26653 movu [r0 + 1330 * 16], m4
26654 pmaddubsw m4, m1, m6
26655 pmulhrsw m4, m7
26656 pmaddubsw m5, m3, m6
26657 pmulhrsw m5, m7
26658 packuswb m4, m5
26659 movu [r0 + 1331 * 16], m4
26660
26661 ; mode 22 [row 26]
26662 movu m6, [r5 + 1 * 16]
26663 pmaddubsw m4, m0, m6
26664 pmulhrsw m4, m7
26665 pmaddubsw m5, m2, m6
26666 pmulhrsw m5, m7
26667 packuswb m4, m5
26668 movu [r0 + 1332 * 16], m4
26669 pmaddubsw m4, m1, m6
26670 pmulhrsw m4, m7
26671 pmaddubsw m5, m3, m6
26672 pmulhrsw m5, m7
26673 packuswb m4, m5
26674 movu [r0 + 1333 * 16], m4
26675
26676 ; mode 22 [row 27]
26677 movu m6, [r5 + 20 * 16]
26678 pslldq m0, 2
26679 pinsrb m0, [r4 + 25], 1
26680 pinsrb m0, [r4 + 27], 0
26681 pmaddubsw m4, m0, m6
26682 pmulhrsw m4, m7
26683 pslldq m2, 2
26684 pinsrb m2, [r4 + 5], 1
26685 pinsrb m2, [r4 + 7], 0
26686 pmaddubsw m5, m2, m6
26687 pmulhrsw m5, m7
26688 packuswb m4, m5
26689 movu [r0 + 1334 * 16], m4
26690 pslldq m1, 2
26691 pinsrw m1, [r3 + 5], 0
26692 pmaddubsw m4, m1, m6
26693 pmulhrsw m4, m7
26694 pslldq m3, 2
26695 pinsrw m3, [r3 + 13], 0
26696 pmaddubsw m5, m3, m6
26697 pmulhrsw m5, m7
26698 packuswb m4, m5
26699 movu [r0 + 1335 * 16], m4
26700
26701 ; mode 22 [row 28]
26702 movu m6, [r5 + 7 * 16]
26703 pmaddubsw m4, m0, m6
26704 pmulhrsw m4, m7
26705 pmaddubsw m5, m2, m6
26706 pmulhrsw m5, m7
26707 packuswb m4, m5
26708 movu [r0 + 1336 * 16], m4
26709 pmaddubsw m4, m1, m6
26710 pmulhrsw m4, m7
26711 pmaddubsw m5, m3, m6
26712 pmulhrsw m5, m7
26713 packuswb m4, m5
26714 movu [r0 + 1337 * 16], m4
26715
26716 ; mode 22 [row 29]
26717 movu m6, [r5 + 26 * 16]
26718 pslldq m0, 2
26719 pinsrb m0, [r4 + 27], 1
26720 pinsrb m0, [r4 + 30], 0
26721 pmaddubsw m4, m0, m6
26722 pmulhrsw m4, m7
26723 pslldq m2, 2
26724 pinsrb m2, [r4 + 7], 1
26725 pinsrb m2, [r4 + 10], 0
26726 pmaddubsw m5, m2, m6
26727 pmulhrsw m5, m7
26728 packuswb m4, m5
26729 movu [r0 + 1338 * 16], m4
26730 pslldq m1, 2
26731 pinsrw m1, [r3 + 4], 0
26732 pmaddubsw m4, m1, m6
26733 pmulhrsw m4, m7
26734 pslldq m3, 2
26735 pinsrw m3, [r3 + 12], 0
26736 pmaddubsw m5, m3, m6
26737 pmulhrsw m5, m7
26738 packuswb m4, m5
26739 movu [r0 + 1339 * 16], m4
26740
26741 ; mode 22 [row 30]
26742 movu m6, [r5 + 13 * 16]
26743 pmaddubsw m4, m0, m6
26744 pmulhrsw m4, m7
26745 pmaddubsw m5, m2, m6
26746 pmulhrsw m5, m7
26747 packuswb m4, m5
26748 movu [r0 + 1340 * 16], m4
26749 pmaddubsw m4, m1, m6
26750 pmulhrsw m4, m7
26751 pmaddubsw m5, m3, m6
26752 pmulhrsw m5, m7
26753 packuswb m4, m5
26754 movu [r0 + 1341 * 16], m4
26755
26756 ; mode 22 [row 31]: last row of the mode is written without any weighting - just a tab_S2 permute of each sample register followed by a half-register store
26757 pshufb m5, m0, [tab_S2] ; NOTE(review): tab_S2 is not visible in this chunk; presumably extracts one byte from each interleaved pair - confirm
26758 movh [r0 + 1342 * 16], m5 ; 8 bytes at row offset 0
26759 pshufb m5, m2, [tab_S2]
26760 movh [r0 + 1342 * 16 + 8], m5 ; 8 bytes at row offset 8
26761 pshufb m5, m1, [tab_S2]
26762 movh [r0 + 1343 * 16], m5 ; 8 bytes at row offset 16
26763 pshufb m5, m3, [tab_S2]
26764 movh [r0 + 1343 * 16 + 8], m5 ; 8 bytes at row offset 24
26765
26766 ; mode 23 [row 0]: fresh load of the sample pairs from r3, weighted with table entry 23, stored as one 32-byte row
26767 movu m6, [r5 + 23 * 16] ; m6 = entry 23 (16 bytes) of the table at r5; r5 is initialised outside this chunk
26768 movu m0, [r3 ]
26769 movu m1, [r3 + 1 ]
26770 punpcklbw m0, m1 ; interleave: m0 = (r3[i], r3[i+1]) for i = 0..7
26771 pmaddubsw m1, m0, m6 ; per-pair multiply-add against m6, giving 8 word sums
26772 pmulhrsw m1, m7 ; m7 is set outside this chunk; presumably the rounding multiplier - confirm
26773 movu m2, [r3 + 8]
26774 movu m3, [r3 + 9]
26775 punpcklbw m2, m3 ; m2 = pairs for i = 8..15
26776 pmaddubsw m3, m2, m6
26777 pmulhrsw m3, m7
26778 packuswb m1, m3 ; pack both word vectors to bytes with unsigned saturation
26779 movu [r0 + 1344 * 16], m1 ; columns 0-15 of the row
26780
26781 movu m1, [r3 + 16]
26782 movu m3, [r3 + 17]
26783 punpcklbw m1, m3 ; m1 = pairs for i = 16..23
26784 pmaddubsw m4, m1, m6 ; results now go to m4/m5, leaving the pair registers intact
26785 pmulhrsw m4, m7
26786 movu m3, [r3 + 24]
26787 movu m5, [r3 + 25]
26788 punpcklbw m3, m5 ; m3 = pairs for i = 24..31
26789 pmaddubsw m5, m3, m6
26790 pmulhrsw m5, m7
26791 packuswb m4, m5
26792 movu [r0 + 1345 * 16], m4 ; columns 16-31 of the row; rows 1 and 2 reuse m0-m3 unchanged with different weights
26793
26794 ; mode 23 [row 1]
26795 movu m6, [r5 + 14 * 16]
26796 pmaddubsw m4, m0, m6
26797 pmulhrsw m4, m7
26798 pmaddubsw m5, m2, m6
26799 pmulhrsw m5, m7
26800 packuswb m4, m5
26801 movu [r0 + 1346 * 16], m4
26802 pmaddubsw m4, m1, m6
26803 pmulhrsw m4, m7
26804 pmaddubsw m5, m3, m6
26805 pmulhrsw m5, m7
26806 packuswb m4, m5
26807 movu [r0 + 1347 * 16], m4
26808
26809 ; mode 23 [row 2]
26810 movu m6, [r5 + 5 * 16]
26811 pmaddubsw m4, m0, m6
26812 pmulhrsw m4, m7
26813 pmaddubsw m5, m2, m6
26814 pmulhrsw m5, m7
26815 packuswb m4, m5
26816 movu [r0 + 1348 * 16], m4
26817 pmaddubsw m4, m1, m6
26818 pmulhrsw m4, m7
26819 pmaddubsw m5, m3, m6
26820 pmulhrsw m5, m7
26821 packuswb m4, m5
26822 movu [r0 + 1349 * 16], m4
26823
26824 ; mode 23 [row 3]
26825 movu m6, [r5 + 28 * 16]
26826 pslldq m0, 2
26827 pinsrb m0, [r4 + 0], 1
26828 pinsrb m0, [r4 + 4], 0
26829 pmaddubsw m4, m0, m6
26830 pmulhrsw m4, m7
26831 pslldq m2, 2
26832 pinsrw m2, [r3 + 7], 0
26833 pmaddubsw m5, m2, m6
26834 pmulhrsw m5, m7
26835 packuswb m4, m5
26836 movu [r0 + 1350 * 16], m4
26837 pslldq m1, 2
26838 pinsrw m1, [r3 + 15], 0
26839 pmaddubsw m4, m1, m6
26840 pmulhrsw m4, m7
26841 pslldq m3, 2
26842 pinsrw m3, [r3 + 23], 0
26843 pmaddubsw m5, m3, m6
26844 pmulhrsw m5, m7
26845 packuswb m4, m5
26846 movu [r0 + 1351 * 16], m4
26847
26848 ; mode 23 [row 4]
26849 movu m6, [r5 + 19 * 16]
26850 pmaddubsw m4, m0, m6
26851 pmulhrsw m4, m7
26852 pmaddubsw m5, m2, m6
26853 pmulhrsw m5, m7
26854 packuswb m4, m5
26855 movu [r0 + 1352 * 16], m4
26856 pmaddubsw m4, m1, m6
26857 pmulhrsw m4, m7
26858 pmaddubsw m5, m3, m6
26859 pmulhrsw m5, m7
26860 packuswb m4, m5
26861 movu [r0 + 1353 * 16], m4
26862
26863 ; mode 23 [row 5]
26864 movu m6, [r5 + 10 * 16]
26865 pmaddubsw m4, m0, m6
26866 pmulhrsw m4, m7
26867 pmaddubsw m5, m2, m6
26868 pmulhrsw m5, m7
26869 packuswb m4, m5
26870 movu [r0 + 1354 * 16], m4
26871 pmaddubsw m4, m1, m6
26872 pmulhrsw m4, m7
26873 pmaddubsw m5, m3, m6
26874 pmulhrsw m5, m7
26875 packuswb m4, m5
26876 movu [r0 + 1355 * 16], m4
26877
26878 ; mode 23 [row 6]
26879 movu m6, [r5 + 1 * 16]
26880 pmaddubsw m4, m0, m6
26881 pmulhrsw m4, m7
26882 pmaddubsw m5, m2, m6
26883 pmulhrsw m5, m7
26884 packuswb m4, m5
26885 movu [r0 + 1356 * 16], m4
26886 pmaddubsw m4, m1, m6
26887 pmulhrsw m4, m7
26888 pmaddubsw m5, m3, m6
26889 pmulhrsw m5, m7
26890 packuswb m4, m5
26891 movu [r0 + 1357 * 16], m4
26892
26893 ; mode 23 [row 7]
26894 movu m6, [r5 + 24 * 16]
26895 pslldq m0, 2
26896 pinsrb m0, [r4 + 4], 1
26897 pinsrb m0, [r4 + 7], 0
26898 pmaddubsw m4, m0, m6
26899 pmulhrsw m4, m7
26900 pslldq m2, 2
26901 pinsrw m2, [r3 + 6], 0
26902 pmaddubsw m5, m2, m6
26903 pmulhrsw m5, m7
26904 packuswb m4, m5
26905 movu [r0 + 1358 * 16], m4
26906 pslldq m1, 2
26907 pinsrw m1, [r3 + 14], 0
26908 pmaddubsw m4, m1, m6
26909 pmulhrsw m4, m7
26910 pslldq m3, 2
26911 pinsrw m3, [r3 + 22], 0
26912 pmaddubsw m5, m3, m6
26913 pmulhrsw m5, m7
26914 packuswb m4, m5
26915 movu [r0 + 1359 * 16], m4
26916
26917 ; mode 23 [row 8]
26918 movu m6, [r5 + 15 * 16]
26919 pmaddubsw m4, m0, m6
26920 pmulhrsw m4, m7
26921 pmaddubsw m5, m2, m6
26922 pmulhrsw m5, m7
26923 packuswb m4, m5
26924 movu [r0 + 1360 * 16], m4
26925 pmaddubsw m4, m1, m6
26926 pmulhrsw m4, m7
26927 pmaddubsw m5, m3, m6
26928 pmulhrsw m5, m7
26929 packuswb m4, m5
26930 movu [r0 + 1361 * 16], m4
26931
26932 ; mode 23 [row 9]
26933 movu m6, [r5 + 6 * 16]
26934 pmaddubsw m4, m0, m6
26935 pmulhrsw m4, m7
26936 pmaddubsw m5, m2, m6
26937 pmulhrsw m5, m7
26938 packuswb m4, m5
26939 movu [r0 + 1362 * 16], m4
26940 pmaddubsw m4, m1, m6
26941 pmulhrsw m4, m7
26942 pmaddubsw m5, m3, m6
26943 pmulhrsw m5, m7
26944 packuswb m4, m5
26945 movu [r0 + 1363 * 16], m4
26946
26947 ; mode 23 [row 10]
26948 movu m6, [r5 + 29 * 16]
26949 pslldq m0, 2
26950 pinsrb m0, [r4 + 7], 1
26951 pinsrb m0, [r4 + 11], 0
26952 pmaddubsw m4, m0, m6
26953 pmulhrsw m4, m7
26954 pslldq m2, 2
26955 pinsrw m2, [r3 + 5], 0
26956 pmaddubsw m5, m2, m6
26957 pmulhrsw m5, m7
26958 packuswb m4, m5
26959 movu [r0 + 1364 * 16], m4
26960 pslldq m1, 2
26961 pinsrw m1, [r3 + 13], 0
26962 pmaddubsw m4, m1, m6
26963 pmulhrsw m4, m7
26964 pslldq m3, 2
26965 pinsrw m3, [r3 + 21], 0
26966 pmaddubsw m5, m3, m6
26967 pmulhrsw m5, m7
26968 packuswb m4, m5
26969 movu [r0 + 1365 * 16], m4
26970
26971 ; mode 23 [row 11]
26972 movu m6, [r5 + 20 * 16]
26973 pmaddubsw m4, m0, m6
26974 pmulhrsw m4, m7
26975 pmaddubsw m5, m2, m6
26976 pmulhrsw m5, m7
26977 packuswb m4, m5
26978 movu [r0 + 1366 * 16], m4
26979 pmaddubsw m4, m1, m6
26980 pmulhrsw m4, m7
26981 pmaddubsw m5, m3, m6
26982 pmulhrsw m5, m7
26983 packuswb m4, m5
26984 movu [r0 + 1367 * 16], m4
26985
26986 ; mode 23 [row 12]
26987 movu m6, [r5 + 11 * 16]
26988 pmaddubsw m4, m0, m6
26989 pmulhrsw m4, m7
26990 pmaddubsw m5, m2, m6
26991 pmulhrsw m5, m7
26992 packuswb m4, m5
26993 movu [r0 + 1368 * 16], m4
26994 pmaddubsw m4, m1, m6
26995 pmulhrsw m4, m7
26996 pmaddubsw m5, m3, m6
26997 pmulhrsw m5, m7
26998 packuswb m4, m5
26999 movu [r0 + 1369 * 16], m4
27000
27001 ; mode 23 [row 13]
27002 movu m6, [r5 + 2 * 16]
27003 pmaddubsw m4, m0, m6
27004 pmulhrsw m4, m7
27005 pmaddubsw m5, m2, m6
27006 pmulhrsw m5, m7
27007 packuswb m4, m5
27008 movu [r0 + 1370 * 16], m4
27009 pmaddubsw m4, m1, m6
27010 pmulhrsw m4, m7
27011 pmaddubsw m5, m3, m6
27012 pmulhrsw m5, m7
27013 packuswb m4, m5
27014 movu [r0 + 1371 * 16], m4
27015
27016 ; mode 23 [row 14]
27017 movu m6, [r5 + 25 * 16]
27018 pslldq m0, 2
27019 pinsrb m0, [r4 + 11], 1
27020 pinsrb m0, [r4 + 14], 0
27021 pmaddubsw m4, m0, m6
27022 pmulhrsw m4, m7
27023 pslldq m2, 2
27024 pinsrw m2, [r3 + 4], 0
27025 pmaddubsw m5, m2, m6
27026 pmulhrsw m5, m7
27027 packuswb m4, m5
27028 movu [r0 + 1372 * 16], m4
27029 pslldq m1, 2
27030 pinsrw m1, [r3 + 12], 0
27031 pmaddubsw m4, m1, m6
27032 pmulhrsw m4, m7
27033 pslldq m3, 2
27034 pinsrw m3, [r3 + 20], 0
27035 pmaddubsw m5, m3, m6
27036 pmulhrsw m5, m7
27037 packuswb m4, m5
27038 movu [r0 + 1373 * 16], m4
27039
27040 ; mode 23 [row 15]
27041 movu m6, [r5 + 16 * 16]
27042 pmaddubsw m4, m0, m6
27043 pmulhrsw m4, m7
27044 pmaddubsw m5, m2, m6
27045 pmulhrsw m5, m7
27046 packuswb m4, m5
27047 movu [r0 + 1374 * 16], m4
27048 pmaddubsw m4, m1, m6
27049 pmulhrsw m4, m7
27050 pmaddubsw m5, m3, m6
27051 pmulhrsw m5, m7
27052 packuswb m4, m5
27053 movu [r0 + 1375 * 16], m4
27054
27055 ; mode 23 [row 16]
27056 movu m6, [r5 + 7 * 16]
27057 pmaddubsw m4, m0, m6
27058 pmulhrsw m4, m7
27059 pmaddubsw m5, m2, m6
27060 pmulhrsw m5, m7
27061 packuswb m4, m5
27062 movu [r0 + 1376 * 16], m4
27063 pmaddubsw m4, m1, m6
27064 pmulhrsw m4, m7
27065 pmaddubsw m5, m3, m6
27066 pmulhrsw m5, m7
27067 packuswb m4, m5
27068 movu [r0 + 1377 * 16], m4
27069
27070 ; mode 23 [row 17]
27071 movu m6, [r5 + 30 * 16]
27072 pslldq m0, 2
27073 pinsrb m0, [r4 + 14], 1
27074 pinsrb m0, [r4 + 18], 0
27075 pmaddubsw m4, m0, m6
27076 pmulhrsw m4, m7
27077 pslldq m2, 2
27078 pinsrw m2, [r3 + 3], 0
27079 pmaddubsw m5, m2, m6
27080 pmulhrsw m5, m7
27081 packuswb m4, m5
27082 movu [r0 + 1378 * 16], m4
27083 pslldq m1, 2
27084 pinsrw m1, [r3 + 11], 0
27085 pmaddubsw m4, m1, m6
27086 pmulhrsw m4, m7
27087 pslldq m3, 2
27088 pinsrw m3, [r3 + 19], 0
27089 pmaddubsw m5, m3, m6
27090 pmulhrsw m5, m7
27091 packuswb m4, m5
27092 movu [r0 + 1379 * 16], m4
27093
27094 ; mode 23 [row 18]
27095 movu m6, [r5 + 21 * 16]
27096 pmaddubsw m4, m0, m6
27097 pmulhrsw m4, m7
27098 pmaddubsw m5, m2, m6
27099 pmulhrsw m5, m7
27100 packuswb m4, m5
27101 movu [r0 + 1380 * 16], m4
27102 pmaddubsw m4, m1, m6
27103 pmulhrsw m4, m7
27104 pmaddubsw m5, m3, m6
27105 pmulhrsw m5, m7
27106 packuswb m4, m5
27107 movu [r0 + 1381 * 16], m4
27108
27109 ; mode 23 [row 19]
27110 movu m6, [r5 + 12 * 16]
27111 pmaddubsw m4, m0, m6
27112 pmulhrsw m4, m7
27113 pmaddubsw m5, m2, m6
27114 pmulhrsw m5, m7
27115 packuswb m4, m5
27116 movu [r0 + 1382 * 16], m4
27117 pmaddubsw m4, m1, m6
27118 pmulhrsw m4, m7
27119 pmaddubsw m5, m3, m6
27120 pmulhrsw m5, m7
27121 packuswb m4, m5
27122 movu [r0 + 1383 * 16], m4
27123
27124 ; mode 23 [row 20]
27125 movu m6, [r5 + 3 * 16]
27126 pmaddubsw m4, m0, m6
27127 pmulhrsw m4, m7
27128 pmaddubsw m5, m2, m6
27129 pmulhrsw m5, m7
27130 packuswb m4, m5
27131 movu [r0 + 1384 * 16], m4
27132 pmaddubsw m4, m1, m6
27133 pmulhrsw m4, m7
27134 pmaddubsw m5, m3, m6
27135 pmulhrsw m5, m7
27136 packuswb m4, m5
27137 movu [r0 + 1385 * 16], m4
27138
27139 ; mode 23 [row 21]
27140 movu m6, [r5 + 26 * 16]
27141 pslldq m0, 2
27142 pinsrb m0, [r4 + 18], 1
27143 pinsrb m0, [r4 + 21], 0
27144 pmaddubsw m4, m0, m6
27145 pmulhrsw m4, m7
27146 pslldq m2, 2
27147 pinsrw m2, [r3 + 2], 0
27148 pmaddubsw m5, m2, m6
27149 pmulhrsw m5, m7
27150 packuswb m4, m5
27151 movu [r0 + 1386 * 16], m4
27152 pslldq m1, 2
27153 pinsrw m1, [r3 + 10], 0
27154 pmaddubsw m4, m1, m6
27155 pmulhrsw m4, m7
27156 pslldq m3, 2
27157 pinsrw m3, [r3 + 18], 0
27158 pmaddubsw m5, m3, m6
27159 pmulhrsw m5, m7
27160 packuswb m4, m5
27161 movu [r0 + 1387 * 16], m4
27162
27163 ; mode 23 [row 22]
27164 movu m6, [r5 + 17 * 16]
27165 pmaddubsw m4, m0, m6
27166 pmulhrsw m4, m7
27167 pmaddubsw m5, m2, m6
27168 pmulhrsw m5, m7
27169 packuswb m4, m5
27170 movu [r0 + 1388 * 16], m4
27171 pmaddubsw m4, m1, m6
27172 pmulhrsw m4, m7
27173 pmaddubsw m5, m3, m6
27174 pmulhrsw m5, m7
27175 packuswb m4, m5
27176 movu [r0 + 1389 * 16], m4
27177
27178 ; mode 23 [row 23]
27179 movu m6, [r5 + 8 * 16]
27180 pmaddubsw m4, m0, m6
27181 pmulhrsw m4, m7
27182 pmaddubsw m5, m2, m6
27183 pmulhrsw m5, m7
27184 packuswb m4, m5
27185 movu [r0 + 1390 * 16], m4
27186 pmaddubsw m4, m1, m6
27187 pmulhrsw m4, m7
27188 pmaddubsw m5, m3, m6
27189 pmulhrsw m5, m7
27190 packuswb m4, m5
27191 movu [r0 + 1391 * 16], m4
27192
27193 ; mode 23 [row 24]
27194 movu m6, [r5 + 31 * 16]
27195 pslldq m0, 2
27196 pinsrb m0, [r4 + 21], 1
27197 pinsrb m0, [r4 + 25], 0
27198 pmaddubsw m4, m0, m6
27199 pmulhrsw m4, m7
27200 pslldq m2, 2
27201 pinsrw m2, [r3 + 1], 0
27202 pmaddubsw m5, m2, m6
27203 pmulhrsw m5, m7
27204 packuswb m4, m5
27205 movu [r0 + 1392 * 16], m4
27206 pslldq m1, 2
27207 pinsrw m1, [r3 + 9], 0
27208 pmaddubsw m4, m1, m6
27209 pmulhrsw m4, m7
27210 pslldq m3, 2
27211 pinsrw m3, [r3 + 17], 0
27212 pmaddubsw m5, m3, m6
27213 pmulhrsw m5, m7
27214 packuswb m4, m5
27215 movu [r0 + 1393 * 16], m4
27216
27217 ; mode 23 [row 25]
27218 movu m6, [r5 + 22 * 16]
27219 pmaddubsw m4, m0, m6
27220 pmulhrsw m4, m7
27221 pmaddubsw m5, m2, m6
27222 pmulhrsw m5, m7
27223 packuswb m4, m5
27224 movu [r0 + 1394 * 16], m4
27225 pmaddubsw m4, m1, m6
27226 pmulhrsw m4, m7
27227 pmaddubsw m5, m3, m6
27228 pmulhrsw m5, m7
27229 packuswb m4, m5
27230 movu [r0 + 1395 * 16], m4
27231
27232 ; mode 23 [row 26]
27233 movu m6, [r5 + 13 * 16]
27234 pmaddubsw m4, m0, m6
27235 pmulhrsw m4, m7
27236 pmaddubsw m5, m2, m6
27237 pmulhrsw m5, m7
27238 packuswb m4, m5
27239 movu [r0 + 1396 * 16], m4
27240 pmaddubsw m4, m1, m6
27241 pmulhrsw m4, m7
27242 pmaddubsw m5, m3, m6
27243 pmulhrsw m5, m7
27244 packuswb m4, m5
27245 movu [r0 + 1397 * 16], m4
27246
27247 ; mode 23 [row 27]
27248 movu m6, [r5 + 4 * 16]
27249 pmaddubsw m4, m0, m6
27250 pmulhrsw m4, m7
27251 pmaddubsw m5, m2, m6
27252 pmulhrsw m5, m7
27253 packuswb m4, m5
27254 movu [r0 + 1398 * 16], m4
27255 pmaddubsw m4, m1, m6
27256 pmulhrsw m4, m7
27257 pmaddubsw m5, m3, m6
27258 pmulhrsw m5, m7
27259 packuswb m4, m5
27260 movu [r0 + 1399 * 16], m4
27261
27262 ; mode 23 [row 28]
27263 movu m6, [r5 + 27 * 16]
27264 pslldq m0, 2
27265 pinsrb m0, [r4 + 25], 1
27266 pinsrb m0, [r4 + 28], 0
27267 pmaddubsw m4, m0, m6
27268 pmulhrsw m4, m7
27269 pslldq m2, 2
27270 pinsrw m2, [r3 + 0], 0
27271 pmaddubsw m5, m2, m6
27272 pmulhrsw m5, m7
27273 packuswb m4, m5
27274 movu [r0 + 1400 * 16], m4
27275 pslldq m1, 2
27276 pinsrw m1, [r3 + 8], 0
27277 pmaddubsw m4, m1, m6
27278 pmulhrsw m4, m7
27279 pslldq m3, 2
27280 pinsrw m3, [r3 + 16], 0
27281 pmaddubsw m5, m3, m6
27282 pmulhrsw m5, m7
27283 packuswb m4, m5
27284 movu [r0 + 1401 * 16], m4
27285
27286 ; mode 23 [row 29]
27287 movu m6, [r5 + 18 * 16]
27288 pmaddubsw m4, m0, m6
27289 pmulhrsw m4, m7
27290 pmaddubsw m5, m2, m6
27291 pmulhrsw m5, m7
27292 packuswb m4, m5
27293 movu [r0 + 1402 * 16], m4
27294 pmaddubsw m4, m1, m6
27295 pmulhrsw m4, m7
27296 pmaddubsw m5, m3, m6
27297 pmulhrsw m5, m7
27298 packuswb m4, m5
27299 movu [r0 + 1403 * 16], m4
27300
27301 ; mode 23 [row 30]
27302 movu m6, [r5 + 9 * 16]
27303 pmaddubsw m4, m0, m6
27304 pmulhrsw m4, m7
27305 pmaddubsw m5, m2, m6
27306 pmulhrsw m5, m7
27307 packuswb m4, m5
27308 movu [r0 + 1404 * 16], m4
27309 pmaddubsw m4, m1, m6
27310 pmulhrsw m4, m7
27311 pmaddubsw m5, m3, m6
27312 pmulhrsw m5, m7
27313 packuswb m4, m5
27314 movu [r0 + 1405 * 16], m4
27315
27316 ; mode 23 [row 31]
27317 pshufb m5, m0, [tab_S2]
27318 movh [r0 + 1406 * 16], m5
27319 pshufb m5, m2, [tab_S2]
27320 movh [r0 + 1406 * 16 + 8], m5
27321 pshufb m5, m1, [tab_S2]
27322 movh [r0 + 1407 * 16], m5
27323 pshufb m5, m3, [tab_S2]
27324 movh [r0 + 1407 * 16 + 8], m5
27325
27326 ; mode 24 [row 0]
27327 movu m6, [r5 + 27 * 16]
27328 movu m0, [r3 ]
27329 movu m1, [r3 + 1 ]
27330 punpcklbw m0, m1
27331 pmaddubsw m4, m0, m6
27332 pmulhrsw m4, m7
27333 movu m2, [r3 + 8]
27334 movu m3, [r3 + 9]
27335 punpcklbw m2, m3
27336 pmaddubsw m5, m2, m6
27337 pmulhrsw m5, m7
27338 packuswb m4, m5
27339 movu [r0 + 1408 * 16], m4
27340
27341 movu m1, [r3 + 16]
27342 movu m3, [r3 + 17]
27343 punpcklbw m1, m3
27344 pmaddubsw m4, m1, m6
27345 pmulhrsw m4, m7
27346 movu m3, [r3 + 24]
27347 movu m5, [r3 + 25]
27348 punpcklbw m3, m5
27349 pmaddubsw m5, m3, m6
27350 pmulhrsw m5, m7
27351 packuswb m4, m5
27352 movu [r0 + 1409 * 16], m4
27353
27354 ; mode 24 [row 1]
27355 movu m6, [r5 + 22 * 16]
27356 pmaddubsw m4, m0, m6
27357 pmulhrsw m4, m7
27358 pmaddubsw m5, m2, m6
27359 pmulhrsw m5, m7
27360 packuswb m4, m5
27361 movu [r0 + 1410 * 16], m4
27362 pmaddubsw m4, m1, m6
27363 pmulhrsw m4, m7
27364 pmaddubsw m5, m3, m6
27365 pmulhrsw m5, m7
27366 packuswb m4, m5
27367 movu [r0 + 1411 * 16], m4
27368
27369 ; mode 24 [row 2]
27370 movu m6, [r5 + 17 * 16]
27371 pmaddubsw m4, m0, m6
27372 pmulhrsw m4, m7
27373 pmaddubsw m5, m2, m6
27374 pmulhrsw m5, m7
27375 packuswb m4, m5
27376 movu [r0 + 1412 * 16], m4
27377 pmaddubsw m4, m1, m6
27378 pmulhrsw m4, m7
27379 pmaddubsw m5, m3, m6
27380 pmulhrsw m5, m7
27381 packuswb m4, m5
27382 movu [r0 + 1413 * 16], m4
27383
27384 ; mode 24 [row 3]
27385 movu m6, [r5 + 12 * 16]
27386 pmaddubsw m4, m0, m6
27387 pmulhrsw m4, m7
27388 pmaddubsw m5, m2, m6
27389 pmulhrsw m5, m7
27390 packuswb m4, m5
27391 movu [r0 + 1414 * 16], m4
27392 pmaddubsw m4, m1, m6
27393 pmulhrsw m4, m7
27394 pmaddubsw m5, m3, m6
27395 pmulhrsw m5, m7
27396 packuswb m4, m5
27397 movu [r0 + 1415 * 16], m4
27398
27399 ; mode 24 [row 4]
27400 movu m6, [r5 + 7 * 16]
27401 pmaddubsw m4, m0, m6
27402 pmulhrsw m4, m7
27403 pmaddubsw m5, m2, m6
27404 pmulhrsw m5, m7
27405 packuswb m4, m5
27406 movu [r0 + 1416 * 16], m4
27407 pmaddubsw m4, m1, m6
27408 pmulhrsw m4, m7
27409 pmaddubsw m5, m3, m6
27410 pmulhrsw m5, m7
27411 packuswb m4, m5
27412 movu [r0 + 1417 * 16], m4
27413
27414 ; mode 24 [row 5]
27415 movu m6, [r5 + 2 * 16]
27416 pmaddubsw m4, m0, m6
27417 pmulhrsw m4, m7
27418 pmaddubsw m5, m2, m6
27419 pmulhrsw m5, m7
27420 packuswb m4, m5
27421 movu [r0 + 1418 * 16], m4
27422 pmaddubsw m4, m1, m6
27423 pmulhrsw m4, m7
27424 pmaddubsw m5, m3, m6
27425 pmulhrsw m5, m7
27426 packuswb m4, m5
27427 movu [r0 + 1419 * 16], m4
27428
27429 ; mode 24 [row 6]
27430 movu m6, [r5 + 29 * 16]
27431 pslldq m0, 2
27432 pinsrb m0, [r4 + 0], 1
27433 pinsrb m0, [r4 + 6], 0
27434 pmaddubsw m4, m0, m6
27435 pmulhrsw m4, m7
27436 pslldq m2, 2
27437 pinsrw m2, [r3 + 7], 0
27438 pmaddubsw m5, m2, m6
27439 pmulhrsw m5, m7
27440 packuswb m4, m5
27441 movu [r0 + 1420 * 16], m4
27442 pslldq m1, 2
27443 pinsrw m1, [r3 + 15], 0
27444 pmaddubsw m4, m1, m6
27445 pmulhrsw m4, m7
27446 pslldq m3, 2
27447 pinsrw m3, [r3 + 23], 0
27448 pmaddubsw m5, m3, m6
27449 pmulhrsw m5, m7
27450 packuswb m4, m5
27451 movu [r0 + 1421 * 16], m4
27452
27453 ; mode 24 [row 7]
27454 movu m6, [r5 + 24 * 16]
27455 pmaddubsw m4, m0, m6
27456 pmulhrsw m4, m7
27457 pmaddubsw m5, m2, m6
27458 pmulhrsw m5, m7
27459 packuswb m4, m5
27460 movu [r0 + 1422 * 16], m4
27461 pmaddubsw m4, m1, m6
27462 pmulhrsw m4, m7
27463 pmaddubsw m5, m3, m6
27464 pmulhrsw m5, m7
27465 packuswb m4, m5
27466 movu [r0 + 1423 * 16], m4
27467
27468 ; mode 24 [row 8]
27469 movu m6, [r5 + 19 * 16]
27470 pmaddubsw m4, m0, m6
27471 pmulhrsw m4, m7
27472 pmaddubsw m5, m2, m6
27473 pmulhrsw m5, m7
27474 packuswb m4, m5
27475 movu [r0 + 1424 * 16], m4
27476 pmaddubsw m4, m1, m6
27477 pmulhrsw m4, m7
27478 pmaddubsw m5, m3, m6
27479 pmulhrsw m5, m7
27480 packuswb m4, m5
27481 movu [r0 + 1425 * 16], m4
27482
27483 ; mode 24 [row 9]
27484 movu m6, [r5 + 14 * 16]
27485 pmaddubsw m4, m0, m6
27486 pmulhrsw m4, m7
27487 pmaddubsw m5, m2, m6
27488 pmulhrsw m5, m7
27489 packuswb m4, m5
27490 movu [r0 + 1426 * 16], m4
27491 pmaddubsw m4, m1, m6
27492 pmulhrsw m4, m7
27493 pmaddubsw m5, m3, m6
27494 pmulhrsw m5, m7
27495 packuswb m4, m5
27496 movu [r0 + 1427 * 16], m4
27497
27498 ; mode 24 [row 10]
27499 movu m6, [r5 + 9 * 16]
27500 pmaddubsw m4, m0, m6
27501 pmulhrsw m4, m7
27502 pmaddubsw m5, m2, m6
27503 pmulhrsw m5, m7
27504 packuswb m4, m5
27505 movu [r0 + 1428 * 16], m4
27506 pmaddubsw m4, m1, m6
27507 pmulhrsw m4, m7
27508 pmaddubsw m5, m3, m6
27509 pmulhrsw m5, m7
27510 packuswb m4, m5
27511 movu [r0 + 1429 * 16], m4
27512
27513 ; mode 24 [row 11]
27514 movu m6, [r5 + 4 * 16]
27515 pmaddubsw m4, m0, m6
27516 pmulhrsw m4, m7
27517 pmaddubsw m5, m2, m6
27518 pmulhrsw m5, m7
27519 packuswb m4, m5
27520 movu [r0 + 1430 * 16], m4
27521 pmaddubsw m4, m1, m6
27522 pmulhrsw m4, m7
27523 pmaddubsw m5, m3, m6
27524 pmulhrsw m5, m7
27525 packuswb m4, m5
27526 movu [r0 + 1431 * 16], m4
27527
27528 ; mode 24 [row 12]
27529 movu m6, [r5 + 31 * 16]
27530 pslldq m0, 2
27531 pinsrb m0, [r4 + 6], 1
27532 pinsrb m0, [r4 + 13], 0
27533 pmaddubsw m4, m0, m6
27534 pmulhrsw m4, m7
27535 pslldq m2, 2
27536 pinsrw m2, [r3 + 6], 0
27537 pmaddubsw m5, m2, m6
27538 pmulhrsw m5, m7
27539 packuswb m4, m5
27540 movu [r0 + 1432 * 16], m4
27541 pslldq m1, 2
27542 pinsrw m1, [r3 + 14], 0
27543 pmaddubsw m4, m1, m6
27544 pmulhrsw m4, m7
27545 pslldq m3, 2
27546 pinsrw m3, [r3 + 22], 0
27547 pmaddubsw m5, m3, m6
27548 pmulhrsw m5, m7
27549 packuswb m4, m5
27550 movu [r0 + 1433 * 16], m4
27551
27552 ; mode 24 [row 13]
27553 movu m6, [r5 + 26 * 16]
27554 pmaddubsw m4, m0, m6
27555 pmulhrsw m4, m7
27556 pmaddubsw m5, m2, m6
27557 pmulhrsw m5, m7
27558 packuswb m4, m5
27559 movu [r0 + 1434 * 16], m4
27560 pmaddubsw m4, m1, m6
27561 pmulhrsw m4, m7
27562 pmaddubsw m5, m3, m6
27563 pmulhrsw m5, m7
27564 packuswb m4, m5
27565 movu [r0 + 1435 * 16], m4
27566
27567 ; mode 24 [row 14]
27568 movu m6, [r5 + 21 * 16]
27569 pmaddubsw m4, m0, m6
27570 pmulhrsw m4, m7
27571 pmaddubsw m5, m2, m6
27572 pmulhrsw m5, m7
27573 packuswb m4, m5
27574 movu [r0 + 1436 * 16], m4
27575 pmaddubsw m4, m1, m6
27576 pmulhrsw m4, m7
27577 pmaddubsw m5, m3, m6
27578 pmulhrsw m5, m7
27579 packuswb m4, m5
27580 movu [r0 + 1437 * 16], m4
27581
27582 ; mode 24 [row 15]
27583 movu m6, [r5 + 16 * 16]
27584 pmaddubsw m4, m0, m6
27585 pmulhrsw m4, m7
27586 pmaddubsw m5, m2, m6
27587 pmulhrsw m5, m7
27588 packuswb m4, m5
27589 movu [r0 + 1438 * 16], m4
27590 pmaddubsw m4, m1, m6
27591 pmulhrsw m4, m7
27592 pmaddubsw m5, m3, m6
27593 pmulhrsw m5, m7
27594 packuswb m4, m5
27595 movu [r0 + 1439 * 16], m4
27596
27597 ; mode 24 [row 16]
27598 movu m6, [r5 + 11 * 16]
27599 pmaddubsw m4, m0, m6
27600 pmulhrsw m4, m7
27601 pmaddubsw m5, m2, m6
27602 pmulhrsw m5, m7
27603 packuswb m4, m5
27604 movu [r0 + 1440 * 16], m4
27605 pmaddubsw m4, m1, m6
27606 pmulhrsw m4, m7
27607 pmaddubsw m5, m3, m6
27608 pmulhrsw m5, m7
27609 packuswb m4, m5
27610 movu [r0 + 1441 * 16], m4
27611
27612 ; mode 24 [row 17]
27613 movu m6, [r5 + 6 * 16]
27614 pmaddubsw m4, m0, m6
27615 pmulhrsw m4, m7
27616 pmaddubsw m5, m2, m6
27617 pmulhrsw m5, m7
27618 packuswb m4, m5
27619 movu [r0 + 1442 * 16], m4
27620 pmaddubsw m4, m1, m6
27621 pmulhrsw m4, m7
27622 pmaddubsw m5, m3, m6
27623 pmulhrsw m5, m7
27624 packuswb m4, m5
27625 movu [r0 + 1443 * 16], m4
27626
27627 ; mode 24 [row 18]
27628 movu m6, [r5 + 1 * 16]
27629 pmaddubsw m4, m0, m6
27630 pmulhrsw m4, m7
27631 pmaddubsw m5, m2, m6
27632 pmulhrsw m5, m7
27633 packuswb m4, m5
27634 movu [r0 + 1444 * 16], m4
27635 pmaddubsw m4, m1, m6
27636 pmulhrsw m4, m7
27637 pmaddubsw m5, m3, m6
27638 pmulhrsw m5, m7
27639 packuswb m4, m5
27640 movu [r0 + 1445 * 16], m4
27641
27642 ; mode 24 [row 19]
27643 movu m6, [r5 + 28 * 16]
27644 pslldq m0, 2
27645 pinsrb m0, [r4 + 13], 1
27646 pinsrb m0, [r4 + 19], 0
27647 pmaddubsw m4, m0, m6
27648 pmulhrsw m4, m7
27649 pslldq m2, 2
27650 pinsrw m2, [r3 + 5], 0
27651 pmaddubsw m5, m2, m6
27652 pmulhrsw m5, m7
27653 packuswb m4, m5
27654 movu [r0 + 1446 * 16], m4
27655 pslldq m1, 2
27656 pinsrw m1, [r3 + 13], 0
27657 pmaddubsw m4, m1, m6
27658 pmulhrsw m4, m7
27659 pslldq m3, 2
27660 pinsrw m3, [r3 + 21], 0
27661 pmaddubsw m5, m3, m6
27662 pmulhrsw m5, m7
27663 packuswb m4, m5
27664 movu [r0 + 1447 * 16], m4
27665
27666 ; mode 24 [row 20]
27667 movu m6, [r5 + 23 * 16]
27668 pmaddubsw m4, m0, m6
27669 pmulhrsw m4, m7
27670 pmaddubsw m5, m2, m6
27671 pmulhrsw m5, m7
27672 packuswb m4, m5
27673 movu [r0 + 1448 * 16], m4
27674 pmaddubsw m4, m1, m6
27675 pmulhrsw m4, m7
27676 pmaddubsw m5, m3, m6
27677 pmulhrsw m5, m7
27678 packuswb m4, m5
27679 movu [r0 + 1449 * 16], m4
27680
27681 ; mode 24 [row 21]
27682 movu m6, [r5 + 18 * 16]
27683 pmaddubsw m4, m0, m6
27684 pmulhrsw m4, m7
27685 pmaddubsw m5, m2, m6
27686 pmulhrsw m5, m7
27687 packuswb m4, m5
27688 movu [r0 + 1450 * 16], m4
27689 pmaddubsw m4, m1, m6
27690 pmulhrsw m4, m7
27691 pmaddubsw m5, m3, m6
27692 pmulhrsw m5, m7
27693 packuswb m4, m5
27694 movu [r0 + 1451 * 16], m4
27695
27696 ; mode 24 [row 22]
27697 movu m6, [r5 + 13 * 16]
27698 pmaddubsw m4, m0, m6
27699 pmulhrsw m4, m7
27700 pmaddubsw m5, m2, m6
27701 pmulhrsw m5, m7
27702 packuswb m4, m5
27703 movu [r0 + 1452 * 16], m4
27704 pmaddubsw m4, m1, m6
27705 pmulhrsw m4, m7
27706 pmaddubsw m5, m3, m6
27707 pmulhrsw m5, m7
27708 packuswb m4, m5
27709 movu [r0 + 1453 * 16], m4
27710
27711 ; mode 24 [row 23]
27712 movu m6, [r5 + 8 * 16]
27713 pmaddubsw m4, m0, m6
27714 pmulhrsw m4, m7
27715 pmaddubsw m5, m2, m6
27716 pmulhrsw m5, m7
27717 packuswb m4, m5
27718 movu [r0 + 1454 * 16], m4
27719 pmaddubsw m4, m1, m6
27720 pmulhrsw m4, m7
27721 pmaddubsw m5, m3, m6
27722 pmulhrsw m5, m7
27723 packuswb m4, m5
27724 movu [r0 + 1455 * 16], m4
27725
27726 ; mode 24 [row 24]
27727 movu m6, [r5 + 3 * 16]
27728 pmaddubsw m4, m0, m6
27729 pmulhrsw m4, m7
27730 pmaddubsw m5, m2, m6
27731 pmulhrsw m5, m7
27732 packuswb m4, m5
27733 movu [r0 + 1456 * 16], m4
27734 pmaddubsw m4, m1, m6
27735 pmulhrsw m4, m7
27736 pmaddubsw m5, m3, m6
27737 pmulhrsw m5, m7
27738 packuswb m4, m5
27739 movu [r0 + 1457 * 16], m4
27740
27741 ; mode 24 [row 25]
27742 movu m6, [r5 + 30 * 16]
27743 pslldq m0, 2
27744 pinsrb m0, [r4 + 19], 1
27745 pinsrb m0, [r4 + 26], 0
27746 pmaddubsw m4, m0, m6
27747 pmulhrsw m4, m7
27748 pslldq m2, 2
27749 pinsrw m2, [r3 + 4], 0
27750 pmaddubsw m5, m2, m6
27751 pmulhrsw m5, m7
27752 packuswb m4, m5
27753 movu [r0 + 1458 * 16], m4
27754 pslldq m1, 2
27755 pinsrw m1, [r3 + 12], 0
27756 pmaddubsw m4, m1, m6
27757 pmulhrsw m4, m7
27758 pslldq m3, 2
27759 pinsrw m3, [r3 + 20], 0
27760 pmaddubsw m5, m3, m6
27761 pmulhrsw m5, m7
27762 packuswb m4, m5
27763 movu [r0 + 1459 * 16], m4
27764
27765 ; mode 24 [row 26]
27766 movu m6, [r5 + 25 * 16]
27767 pmaddubsw m4, m0, m6
27768 pmulhrsw m4, m7
27769 pmaddubsw m5, m2, m6
27770 pmulhrsw m5, m7
27771 packuswb m4, m5
27772 movu [r0 + 1460 * 16], m4
27773 pmaddubsw m4, m1, m6
27774 pmulhrsw m4, m7
27775 pmaddubsw m5, m3, m6
27776 pmulhrsw m5, m7
27777 packuswb m4, m5
27778 movu [r0 + 1461 * 16], m4
27779
27780 ; mode 24 [row 27]
27781 movu m6, [r5 + 20 * 16]
27782 pmaddubsw m4, m0, m6
27783 pmulhrsw m4, m7
27784 pmaddubsw m5, m2, m6
27785 pmulhrsw m5, m7
27786 packuswb m4, m5
27787 movu [r0 + 1462 * 16], m4
27788 pmaddubsw m4, m1, m6
27789 pmulhrsw m4, m7
27790 pmaddubsw m5, m3, m6
27791 pmulhrsw m5, m7
27792 packuswb m4, m5
27793 movu [r0 + 1463 * 16], m4
27794
27795 ; mode 24 [row 28]
27796 movu m6, [r5 + 15 * 16]
27797 pmaddubsw m4, m0, m6
27798 pmulhrsw m4, m7
27799 pmaddubsw m5, m2, m6
27800 pmulhrsw m5, m7
27801 packuswb m4, m5
27802 movu [r0 + 1464 * 16], m4
27803 pmaddubsw m4, m1, m6
27804 pmulhrsw m4, m7
27805 pmaddubsw m5, m3, m6
27806 pmulhrsw m5, m7
27807 packuswb m4, m5
27808 movu [r0 + 1465 * 16], m4
27809
27810 ; mode 24 [row 29]
27811 movu m6, [r5 + 10 * 16]
27812 pmaddubsw m4, m0, m6
27813 pmulhrsw m4, m7
27814 pmaddubsw m5, m2, m6
27815 pmulhrsw m5, m7
27816 packuswb m4, m5
27817 movu [r0 + 1466 * 16], m4
27818 pmaddubsw m4, m1, m6
27819 pmulhrsw m4, m7
27820 pmaddubsw m5, m3, m6
27821 pmulhrsw m5, m7
27822 packuswb m4, m5
27823 movu [r0 + 1467 * 16], m4
27824
27825 ; mode 24 [row 30]
27826 movu m6, [r5 + 5 * 16]
27827 pmaddubsw m4, m0, m6
27828 pmulhrsw m4, m7
27829 pmaddubsw m5, m2, m6
27830 pmulhrsw m5, m7
27831 packuswb m4, m5
27832 movu [r0 + 1468 * 16], m4
27833 pmaddubsw m4, m1, m6
27834 pmulhrsw m4, m7
27835 pmaddubsw m5, m3, m6
27836 pmulhrsw m5, m7
27837 packuswb m4, m5
27838 movu [r0 + 1469 * 16], m4
27839
27840 ; mode 24 [row 31]
27841 pshufb m5, m0, [tab_S2]
27842 movh [r0 + 1470 * 16], m5
27843 pshufb m5, m2, [tab_S2]
27844 movh [r0 + 1470 * 16 + 8], m5
27845 pshufb m5, m1, [tab_S2]
27846 movh [r0 + 1471 * 16], m5
27847 pshufb m5, m3, [tab_S2]
27848 movh [r0 + 1471 * 16 + 8], m5
27849
27850 ; mode 25 [row 0]
27851 movu m6, [r5 + 30 * 16]
27852 movu m0, [r3 ]
27853 movu m1, [r3 + 1 ]
27854 punpcklbw m0, m1
27855 pmaddubsw m4, m0, m6
27856 pmulhrsw m4, m7
27857 movu m2, [r3 + 8]
27858 movu m3, [r3 + 9]
27859 punpcklbw m2, m3
27860 pmaddubsw m5, m2, m6
27861 pmulhrsw m5, m7
27862 packuswb m4, m5
27863 movu [r0 + 1472 * 16], m4
27864
27865 movu m1, [r3 + 16]
27866 movu m3, [r3 + 17]
27867 punpcklbw m1, m3
27868 pmaddubsw m4, m1, m6
27869 pmulhrsw m4, m7
27870 movu m3, [r3 + 24]
27871 movu m5, [r3 + 25]
27872 punpcklbw m3, m5
27873 pmaddubsw m5, m3, m6
27874 pmulhrsw m5, m7
27875 packuswb m4, m5
27876 movu [r0 + 1473 * 16], m4
27877
27878 ; mode 25 [row 1]
27879 movu m6, [r5 + 28 * 16]
27880 pmaddubsw m4, m0, m6
27881 pmulhrsw m4, m7
27882 pmaddubsw m5, m2, m6
27883 pmulhrsw m5, m7
27884 packuswb m4, m5
27885 movu [r0 + 1474 * 16], m4
27886 pmaddubsw m4, m1, m6
27887 pmulhrsw m4, m7
27888 pmaddubsw m5, m3, m6
27889 pmulhrsw m5, m7
27890 packuswb m4, m5
27891 movu [r0 + 1475 * 16], m4
27892
27893 ; mode 25 [row 2]
27894 movu m6, [r5 + 26 * 16]
27895 pmaddubsw m4, m0, m6
27896 pmulhrsw m4, m7
27897 pmaddubsw m5, m2, m6
27898 pmulhrsw m5, m7
27899 packuswb m4, m5
27900 movu [r0 + 1476 * 16], m4
27901 pmaddubsw m4, m1, m6
27902 pmulhrsw m4, m7
27903 pmaddubsw m5, m3, m6
27904 pmulhrsw m5, m7
27905 packuswb m4, m5
27906 movu [r0 + 1477 * 16], m4
27907
27908 ; mode 25 [row 3]
27909 movu m6, [r5 + 24 * 16]
27910 pmaddubsw m4, m0, m6
27911 pmulhrsw m4, m7
27912 pmaddubsw m5, m2, m6
27913 pmulhrsw m5, m7
27914 packuswb m4, m5
27915 movu [r0 + 1478 * 16], m4
27916 pmaddubsw m4, m1, m6
27917 pmulhrsw m4, m7
27918 pmaddubsw m5, m3, m6
27919 pmulhrsw m5, m7
27920 packuswb m4, m5
27921 movu [r0 + 1479 * 16], m4
27922
27923 ; mode 25 [row 4]
27924 movu m6, [r5 + 22 * 16]
27925 pmaddubsw m4, m0, m6
27926 pmulhrsw m4, m7
27927 pmaddubsw m5, m2, m6
27928 pmulhrsw m5, m7
27929 packuswb m4, m5
27930 movu [r0 + 1480 * 16], m4
27931 pmaddubsw m4, m1, m6
27932 pmulhrsw m4, m7
27933 pmaddubsw m5, m3, m6
27934 pmulhrsw m5, m7
27935 packuswb m4, m5
27936 movu [r0 + 1481 * 16], m4
27937
27938 ; mode 25 [row 5]
27939 movu m6, [r5 + 20 * 16]
27940 pmaddubsw m4, m0, m6
27941 pmulhrsw m4, m7
27942 pmaddubsw m5, m2, m6
27943 pmulhrsw m5, m7
27944 packuswb m4, m5
27945 movu [r0 + 1482 * 16], m4
27946 pmaddubsw m4, m1, m6
27947 pmulhrsw m4, m7
27948 pmaddubsw m5, m3, m6
27949 pmulhrsw m5, m7
27950 packuswb m4, m5
27951 movu [r0 + 1483 * 16], m4
27952
27953 ; mode 25 [row 6]
27954 movu m6, [r5 + 18 * 16]
27955 pmaddubsw m4, m0, m6
27956 pmulhrsw m4, m7
27957 pmaddubsw m5, m2, m6
27958 pmulhrsw m5, m7
27959 packuswb m4, m5
27960 movu [r0 + 1484 * 16], m4
27961 pmaddubsw m4, m1, m6
27962 pmulhrsw m4, m7
27963 pmaddubsw m5, m3, m6
27964 pmulhrsw m5, m7
27965 packuswb m4, m5
27966 movu [r0 + 1485 * 16], m4
27967
27968 ; mode 25 [row 7]
27969 movu m6, [r5 + 16 * 16]
27970 pmaddubsw m4, m0, m6
27971 pmulhrsw m4, m7
27972 pmaddubsw m5, m2, m6
27973 pmulhrsw m5, m7
27974 packuswb m4, m5
27975 movu [r0 + 1486 * 16], m4
27976 pmaddubsw m4, m1, m6
27977 pmulhrsw m4, m7
27978 pmaddubsw m5, m3, m6
27979 pmulhrsw m5, m7
27980 packuswb m4, m5
27981 movu [r0 + 1487 * 16], m4
27982
27983 ; mode 25 [row 8]
27984 movu m6, [r5 + 14 * 16]
27985 pmaddubsw m4, m0, m6
27986 pmulhrsw m4, m7
27987 pmaddubsw m5, m2, m6
27988 pmulhrsw m5, m7
27989 packuswb m4, m5
27990 movu [r0 + 1488 * 16], m4
27991 pmaddubsw m4, m1, m6
27992 pmulhrsw m4, m7
27993 pmaddubsw m5, m3, m6
27994 pmulhrsw m5, m7
27995 packuswb m4, m5
27996 movu [r0 + 1489 * 16], m4
27997
27998 ; mode 25 [row 9]
27999 movu m6, [r5 + 12 * 16]
28000 pmaddubsw m4, m0, m6
28001 pmulhrsw m4, m7
28002 pmaddubsw m5, m2, m6
28003 pmulhrsw m5, m7
28004 packuswb m4, m5
28005 movu [r0 + 1490 * 16], m4
28006 pmaddubsw m4, m1, m6
28007 pmulhrsw m4, m7
28008 pmaddubsw m5, m3, m6
28009 pmulhrsw m5, m7
28010 packuswb m4, m5
28011 movu [r0 + 1491 * 16], m4
28012
28013 ; mode 25 [row 10]
28014 movu m6, [r5 + 10 * 16]
28015 pmaddubsw m4, m0, m6
28016 pmulhrsw m4, m7
28017 pmaddubsw m5, m2, m6
28018 pmulhrsw m5, m7
28019 packuswb m4, m5
28020 movu [r0 + 1492 * 16], m4
28021 pmaddubsw m4, m1, m6
28022 pmulhrsw m4, m7
28023 pmaddubsw m5, m3, m6
28024 pmulhrsw m5, m7
28025 packuswb m4, m5
28026 movu [r0 + 1493 * 16], m4
28027
28028 ; mode 25 [row 11]
28029 movu m6, [r5 + 8 * 16]
28030 pmaddubsw m4, m0, m6
28031 pmulhrsw m4, m7
28032 pmaddubsw m5, m2, m6
28033 pmulhrsw m5, m7
28034 packuswb m4, m5
28035 movu [r0 + 1494 * 16], m4
28036 pmaddubsw m4, m1, m6
28037 pmulhrsw m4, m7
28038 pmaddubsw m5, m3, m6
28039 pmulhrsw m5, m7
28040 packuswb m4, m5
28041 movu [r0 + 1495 * 16], m4
28042
28043 ; mode 25 [row 12]
28044 movu m6, [r5 + 6 * 16]
28045 pmaddubsw m4, m0, m6
28046 pmulhrsw m4, m7
28047 pmaddubsw m5, m2, m6
28048 pmulhrsw m5, m7
28049 packuswb m4, m5
28050 movu [r0 + 1496 * 16], m4
28051 pmaddubsw m4, m1, m6
28052 pmulhrsw m4, m7
28053 pmaddubsw m5, m3, m6
28054 pmulhrsw m5, m7
28055 packuswb m4, m5
28056 movu [r0 + 1497 * 16], m4
28057
28058 ; mode 25 [row 13]
28059 movu m6, [r5 + 4 * 16]
28060 pmaddubsw m4, m0, m6
28061 pmulhrsw m4, m7
28062 pmaddubsw m5, m2, m6
28063 pmulhrsw m5, m7
28064 packuswb m4, m5
28065 movu [r0 + 1498 * 16], m4
28066 pmaddubsw m4, m1, m6
28067 pmulhrsw m4, m7
28068 pmaddubsw m5, m3, m6
28069 pmulhrsw m5, m7
28070 packuswb m4, m5
28071 movu [r0 + 1499 * 16], m4
28072
28073 ; mode 25 [row 14]
28074 movu m6, [r5 + 2 * 16]
28075 pmaddubsw m4, m0, m6
28076 pmulhrsw m4, m7
28077 pmaddubsw m5, m2, m6
28078 pmulhrsw m5, m7
28079 packuswb m4, m5
28080 movu [r0 + 1500 * 16], m4
28081 pmaddubsw m4, m1, m6
28082 pmulhrsw m4, m7
28083 pmaddubsw m5, m3, m6
28084 pmulhrsw m5, m7
28085 packuswb m4, m5
28086 movu [r0 + 1501 * 16], m4
28087
28088 ; mode 25 [row 15]
28089 pshufb m5, m0, [tab_S2]
28090 movh [r0 + 1502 * 16], m5
28091 pshufb m5, m2, [tab_S2]
28092 movh [r0 + 1502 * 16 + 8], m5
28093 pshufb m5, m1, [tab_S2]
28094 movh [r0 + 1503 * 16], m5
28095 pshufb m5, m3, [tab_S2]
28096 movh [r0 + 1503 * 16 + 8], m5
28097
28098 ; mode 25 [row 16]
28099 movu m6, [r5 + 30 * 16]
28100 pslldq m0, 2
28101 pinsrb m0, [r4 + 0], 1
28102 pinsrb m0, [r4 + 16], 0
28103 pmaddubsw m4, m0, m6
28104 pmulhrsw m4, m7
28105 pslldq m2, 2
28106 pinsrw m2, [r3 + 7], 0
28107 pmaddubsw m5, m2, m6
28108 pmulhrsw m5, m7
28109 packuswb m4, m5
28110 movu [r0 + 1504 * 16], m4
28111 pslldq m1, 2
28112 pinsrw m1, [r3 + 15], 0
28113 pmaddubsw m4, m1, m6
28114 pmulhrsw m4, m7
28115 pslldq m3, 2
28116 pinsrw m3, [r3 + 23], 0
28117 pmaddubsw m5, m3, m6
28118 pmulhrsw m5, m7
28119 packuswb m4, m5
28120 movu [r0 + 1505 * 16], m4
28121
28122 ; mode 25 [row 17]
28123 movu m6, [r5 + 28 * 16]
28124 pmaddubsw m4, m0, m6
28125 pmulhrsw m4, m7
28126 pmaddubsw m5, m2, m6
28127 pmulhrsw m5, m7
28128 packuswb m4, m5
28129 movu [r0 + 1506 * 16], m4
28130 pmaddubsw m4, m1, m6
28131 pmulhrsw m4, m7
28132 pmaddubsw m5, m3, m6
28133 pmulhrsw m5, m7
28134 packuswb m4, m5
28135 movu [r0 + 1507 * 16], m4
28136
28137 ; mode 25 [row 18]
28138 movu m6, [r5 + 26 * 16]
28139 pmaddubsw m4, m0, m6
28140 pmulhrsw m4, m7
28141 pmaddubsw m5, m2, m6
28142 pmulhrsw m5, m7
28143 packuswb m4, m5
28144 movu [r0 + 1508 * 16], m4
28145 pmaddubsw m4, m1, m6
28146 pmulhrsw m4, m7
28147 pmaddubsw m5, m3, m6
28148 pmulhrsw m5, m7
28149 packuswb m4, m5
28150 movu [r0 + 1509 * 16], m4
28151
28152 ; mode 25 [row 19]
28153 movu m6, [r5 + 24 * 16]
28154 pmaddubsw m4, m0, m6
28155 pmulhrsw m4, m7
28156 pmaddubsw m5, m2, m6
28157 pmulhrsw m5, m7
28158 packuswb m4, m5
28159 movu [r0 + 1510 * 16], m4
28160 pmaddubsw m4, m1, m6
28161 pmulhrsw m4, m7
28162 pmaddubsw m5, m3, m6
28163 pmulhrsw m5, m7
28164 packuswb m4, m5
28165 movu [r0 + 1511 * 16], m4
28166
28167 ; mode 25 [row 20]
28168 movu m6, [r5 + 22 * 16]
28169 pmaddubsw m4, m0, m6
28170 pmulhrsw m4, m7
28171 pmaddubsw m5, m2, m6
28172 pmulhrsw m5, m7
28173 packuswb m4, m5
28174 movu [r0 + 1512 * 16], m4
28175 pmaddubsw m4, m1, m6
28176 pmulhrsw m4, m7
28177 pmaddubsw m5, m3, m6
28178 pmulhrsw m5, m7
28179 packuswb m4, m5
28180 movu [r0 + 1513 * 16], m4
28181
28182 ; mode 25 [row 21]
28183 movu m6, [r5 + 20 * 16]
28184 pmaddubsw m4, m0, m6
28185 pmulhrsw m4, m7
28186 pmaddubsw m5, m2, m6
28187 pmulhrsw m5, m7
28188 packuswb m4, m5
28189 movu [r0 + 1514 * 16], m4
28190 pmaddubsw m4, m1, m6
28191 pmulhrsw m4, m7
28192 pmaddubsw m5, m3, m6
28193 pmulhrsw m5, m7
28194 packuswb m4, m5
28195 movu [r0 + 1515 * 16], m4
28196
28197 ; mode 25 [row 22]
28198 movu m6, [r5 + 18 * 16]
28199 pmaddubsw m4, m0, m6
28200 pmulhrsw m4, m7
28201 pmaddubsw m5, m2, m6
28202 pmulhrsw m5, m7
28203 packuswb m4, m5
28204 movu [r0 + 1516 * 16], m4
28205 pmaddubsw m4, m1, m6
28206 pmulhrsw m4, m7
28207 pmaddubsw m5, m3, m6
28208 pmulhrsw m5, m7
28209 packuswb m4, m5
28210 movu [r0 + 1517 * 16], m4
28211
28212 ; mode 25 [row 23]
28213 movu m6, [r5 + 16 * 16]
28214 pmaddubsw m4, m0, m6
28215 pmulhrsw m4, m7
28216 pmaddubsw m5, m2, m6
28217 pmulhrsw m5, m7
28218 packuswb m4, m5
28219 movu [r0 + 1518 * 16], m4
28220 pmaddubsw m4, m1, m6
28221 pmulhrsw m4, m7
28222 pmaddubsw m5, m3, m6
28223 pmulhrsw m5, m7
28224 packuswb m4, m5
28225 movu [r0 + 1519 * 16], m4
28226
28227 ; mode 25 [row 24]
28228 movu m6, [r5 + 14 * 16]
28229 pmaddubsw m4, m0, m6
28230 pmulhrsw m4, m7
28231 pmaddubsw m5, m2, m6
28232 pmulhrsw m5, m7
28233 packuswb m4, m5
28234 movu [r0 + 1520 * 16], m4
28235 pmaddubsw m4, m1, m6
28236 pmulhrsw m4, m7
28237 pmaddubsw m5, m3, m6
28238 pmulhrsw m5, m7
28239 packuswb m4, m5
28240 movu [r0 + 1521 * 16], m4
28241
28242 ; mode 25 [row 25]
28243 movu m6, [r5 + 12 * 16]
28244 pmaddubsw m4, m0, m6
28245 pmulhrsw m4, m7
28246 pmaddubsw m5, m2, m6
28247 pmulhrsw m5, m7
28248 packuswb m4, m5
28249 movu [r0 + 1522 * 16], m4
28250 pmaddubsw m4, m1, m6
28251 pmulhrsw m4, m7
28252 pmaddubsw m5, m3, m6
28253 pmulhrsw m5, m7
28254 packuswb m4, m5
28255 movu [r0 + 1523 * 16], m4
28256
28257 ; mode 25 [row 26]
28258 movu m6, [r5 + 10 * 16]
28259 pmaddubsw m4, m0, m6
28260 pmulhrsw m4, m7
28261 pmaddubsw m5, m2, m6
28262 pmulhrsw m5, m7
28263 packuswb m4, m5
28264 movu [r0 + 1524 * 16], m4
28265 pmaddubsw m4, m1, m6
28266 pmulhrsw m4, m7
28267 pmaddubsw m5, m3, m6
28268 pmulhrsw m5, m7
28269 packuswb m4, m5
28270 movu [r0 + 1525 * 16], m4
28271
28272 ; mode 25 [row 27]
28273 movu m6, [r5 + 8 * 16]
28274 pmaddubsw m4, m0, m6
28275 pmulhrsw m4, m7
28276 pmaddubsw m5, m2, m6
28277 pmulhrsw m5, m7
28278 packuswb m4, m5
28279 movu [r0 + 1526 * 16], m4
28280 pmaddubsw m4, m1, m6
28281 pmulhrsw m4, m7
28282 pmaddubsw m5, m3, m6
28283 pmulhrsw m5, m7
28284 packuswb m4, m5
28285 movu [r0 + 1527 * 16], m4
28286
28287 ; mode 25 [row 28]
28288 movu m6, [r5 + 6 * 16]
28289 pmaddubsw m4, m0, m6
28290 pmulhrsw m4, m7
28291 pmaddubsw m5, m2, m6
28292 pmulhrsw m5, m7
28293 packuswb m4, m5
28294 movu [r0 + 1528 * 16], m4
28295 pmaddubsw m4, m1, m6
28296 pmulhrsw m4, m7
28297 pmaddubsw m5, m3, m6
28298 pmulhrsw m5, m7
28299 packuswb m4, m5
28300 movu [r0 + 1529 * 16], m4
28301
28302 ; mode 25 [row 29]
28303 movu m6, [r5 + 4 * 16]
28304 pmaddubsw m4, m0, m6
28305 pmulhrsw m4, m7
28306 pmaddubsw m5, m2, m6
28307 pmulhrsw m5, m7
28308 packuswb m4, m5
28309 movu [r0 + 1530 * 16], m4
28310 pmaddubsw m4, m1, m6
28311 pmulhrsw m4, m7
28312 pmaddubsw m5, m3, m6
28313 pmulhrsw m5, m7
28314 packuswb m4, m5
28315 movu [r0 + 1531 * 16], m4
28316
28317 ; mode 25 [row 30]
28318 movu m6, [r5 + 2 * 16]
28319 pmaddubsw m4, m0, m6
28320 pmulhrsw m4, m7
28321 pmaddubsw m5, m2, m6
28322 pmulhrsw m5, m7
28323 packuswb m4, m5
28324 movu [r0 + 1532 * 16], m4
28325 pmaddubsw m4, m1, m6
28326 pmulhrsw m4, m7
28327 pmaddubsw m5, m3, m6
28328 pmulhrsw m5, m7
28329 packuswb m4, m5
28330 movu [r0 + 1533 * 16], m4
28331
28332 ; mode 25 [row 31]
28333 pshufb m5, m0, [tab_S2]
28334 movh [r0 + 1534 * 16], m5
28335 pshufb m5, m2, [tab_S2]
28336 movh [r0 + 1534 * 16 + 8], m5
28337 pshufb m5, m1, [tab_S2]
28338 movh [r0 + 1535 * 16], m5
28339 pshufb m5, m3, [tab_S2]
28340 movh [r0 + 1535 * 16 + 8], m5
28341
28342 ; mode 26
28343 movu m1, [r1 + 1]
28344 movu m2, [r1 + 17]
28345 movu [r0 + 1536 * 16], m1
28346 movu [r0 + 1537 * 16], m2
28347 movu [r0 + 1538 * 16], m1
28348 movu [r0 + 1539 * 16], m2
28349 movu [r0 + 1540 * 16], m1
28350 movu [r0 + 1541 * 16], m2
28351 movu [r0 + 1542 * 16], m1
28352 movu [r0 + 1543 * 16], m2
28353 movu [r0 + 1544 * 16], m1
28354 movu [r0 + 1545 * 16], m2
28355 movu [r0 + 1546 * 16], m1
28356 movu [r0 + 1547 * 16], m2
28357 movu [r0 + 1548 * 16], m1
28358 movu [r0 + 1549 * 16], m2
28359 movu [r0 + 1550 * 16], m1
28360 movu [r0 + 1551 * 16], m2
28361
28362 movu [r0 + 1552 * 16], m1
28363 movu [r0 + 1553 * 16], m2
28364 movu [r0 + 1554 * 16], m1
28365 movu [r0 + 1555 * 16], m2
28366 movu [r0 + 1556 * 16], m1
28367 movu [r0 + 1557 * 16], m2
28368 movu [r0 + 1558 * 16], m1
28369 movu [r0 + 1559 * 16], m2
28370 movu [r0 + 1560 * 16], m1
28371 movu [r0 + 1561 * 16], m2
28372 movu [r0 + 1562 * 16], m1
28373 movu [r0 + 1563 * 16], m2
28374 movu [r0 + 1564 * 16], m1
28375 movu [r0 + 1565 * 16], m2
28376 movu [r0 + 1566 * 16], m1
28377 movu [r0 + 1567 * 16], m2
28378
28379 movu [r0 + 1568 * 16], m1
28380 movu [r0 + 1569 * 16], m2
28381 movu [r0 + 1570 * 16], m1
28382 movu [r0 + 1571 * 16], m2
28383 movu [r0 + 1572 * 16], m1
28384 movu [r0 + 1573 * 16], m2
28385 movu [r0 + 1574 * 16], m1
28386 movu [r0 + 1575 * 16], m2
28387 movu [r0 + 1576 * 16], m1
28388 movu [r0 + 1577 * 16], m2
28389 movu [r0 + 1578 * 16], m1
28390 movu [r0 + 1579 * 16], m2
28391 movu [r0 + 1580 * 16], m1
28392 movu [r0 + 1581 * 16], m2
28393 movu [r0 + 1582 * 16], m1
28394 movu [r0 + 1583 * 16], m2
28395
28396 movu [r0 + 1584 * 16], m1
28397 movu [r0 + 1585 * 16], m2
28398 movu [r0 + 1586 * 16], m1
28399 movu [r0 + 1587 * 16], m2
28400 movu [r0 + 1588 * 16], m1
28401 movu [r0 + 1589 * 16], m2
28402 movu [r0 + 1590 * 16], m1
28403 movu [r0 + 1591 * 16], m2
28404 movu [r0 + 1592 * 16], m1
28405 movu [r0 + 1593 * 16], m2
28406 movu [r0 + 1594 * 16], m1
28407 movu [r0 + 1595 * 16], m2
28408 movu [r0 + 1596 * 16], m1
28409 movu [r0 + 1597 * 16], m2
28410 movu [r0 + 1598 * 16], m1
28411 movu [r0 + 1599 * 16], m2
28412
28413 ; mode 27 [row 0]
28414 movu m6, [r5 + 2 * 16]
28415 movu m0, [r3 + 1 ]
28416 movu m1, [r3 + 2 ]
28417 punpcklbw m0, m1
28418 pmaddubsw m4, m0, m6
28419 pmulhrsw m4, m7
28420 movu m2, [r3 + 9]
28421 movu m3, [r3 + 10]
28422 punpcklbw m2, m3
28423 pmaddubsw m5, m2, m6
28424 pmulhrsw m5, m7
28425 packuswb m4, m5
28426 movu [r0 + 1600 * 16], m4
28427
28428 movu m1, [r3 + 17]
28429 movu m3, [r3 + 18]
28430 punpcklbw m1, m3
28431 pmaddubsw m4, m1, m6
28432 pmulhrsw m4, m7
28433 movu m3, [r3 + 25]
28434 movu m5, [r3 + 26]
28435 punpcklbw m3, m5
28436 pmaddubsw m5, m3, m6
28437 pmulhrsw m5, m7
28438 packuswb m4, m5
28439 movu [r0 + 1601 * 16], m4
28440
28441 ; mode 27 [row 1]
28442 movu m6, [r5 + 4 * 16]
28443 pmaddubsw m4, m0, m6
28444 pmulhrsw m4, m7
28445 pmaddubsw m5, m2, m6
28446 pmulhrsw m5, m7
28447 packuswb m4, m5
28448 movu [r0 + 1602 * 16], m4
28449 pmaddubsw m4, m1, m6
28450 pmulhrsw m4, m7
28451 pmaddubsw m5, m3, m6
28452 pmulhrsw m5, m7
28453 packuswb m4, m5
28454 movu [r0 + 1603 * 16], m4
28455
28456 ; mode 27 [row 2]
28457 movu m6, [r5 + 6 * 16]
28458 pmaddubsw m4, m0, m6
28459 pmulhrsw m4, m7
28460 pmaddubsw m5, m2, m6
28461 pmulhrsw m5, m7
28462 packuswb m4, m5
28463 movu [r0 + 1604 * 16], m4
28464 pmaddubsw m4, m1, m6
28465 pmulhrsw m4, m7
28466 pmaddubsw m5, m3, m6
28467 pmulhrsw m5, m7
28468 packuswb m4, m5
28469 movu [r0 + 1605 * 16], m4
28470
28471 ; mode 27 [row 3]
28472 movu m6, [r5 + 8 * 16]
28473 pmaddubsw m4, m0, m6
28474 pmulhrsw m4, m7
28475 pmaddubsw m5, m2, m6
28476 pmulhrsw m5, m7
28477 packuswb m4, m5
28478 movu [r0 + 1606 * 16], m4
28479 pmaddubsw m4, m1, m6
28480 pmulhrsw m4, m7
28481 pmaddubsw m5, m3, m6
28482 pmulhrsw m5, m7
28483 packuswb m4, m5
28484 movu [r0 + 1607 * 16], m4
28485
28486 ; mode 27 [row 4]
28487 movu m6, [r5 + 10 * 16]
28488 pmaddubsw m4, m0, m6
28489 pmulhrsw m4, m7
28490 pmaddubsw m5, m2, m6
28491 pmulhrsw m5, m7
28492 packuswb m4, m5
28493 movu [r0 + 1608 * 16], m4
28494
28495 ; mode 28 [row 1 - first half]
28496 movu [r0 + 1666 * 16], m4
28497
28498 pmaddubsw m4, m1, m6
28499 pmulhrsw m4, m7
28500 pmaddubsw m5, m3, m6
28501 pmulhrsw m5, m7
28502 packuswb m4, m5
28503 movu [r0 + 1609 * 16], m4
28504
28505 ; mode 28 [row 1 - second half]
28506 movu [r0 + 1667 * 16], m4
28507
28508 ; mode 27 [row 5]
28509 movu m6, [r5 + 12 * 16]
28510 pmaddubsw m4, m0, m6
28511 pmulhrsw m4, m7
28512 pmaddubsw m5, m2, m6
28513 pmulhrsw m5, m7
28514 packuswb m4, m5
28515 movu [r0 + 1610 * 16], m4
28516
28517 pmaddubsw m4, m1, m6
28518 pmulhrsw m4, m7
28519 pmaddubsw m5, m3, m6
28520 pmulhrsw m5, m7
28521 packuswb m4, m5
28522 movu [r0 + 1611 * 16], m4
28523
28524 ; mode 27 [row 6]
28525 movu m6, [r5 + 14 * 16]
28526 pmaddubsw m4, m0, m6
28527 pmulhrsw m4, m7
28528 pmaddubsw m5, m2, m6
28529 pmulhrsw m5, m7
28530 packuswb m4, m5
28531 movu [r0 + 1612 * 16], m4
28532 pmaddubsw m4, m1, m6
28533 pmulhrsw m4, m7
28534 pmaddubsw m5, m3, m6
28535 pmulhrsw m5, m7
28536 packuswb m4, m5
28537 movu [r0 + 1613 * 16], m4
28538
28539 ; mode 27 [row 7]
28540 movu m6, [r5 + 16 * 16]
28541 pmaddubsw m4, m0, m6
28542 pmulhrsw m4, m7
28543 pmaddubsw m5, m2, m6
28544 pmulhrsw m5, m7
28545 packuswb m4, m5
28546 movu [r0 + 1614 * 16], m4
28547 pmaddubsw m4, m1, m6
28548 pmulhrsw m4, m7
28549 pmaddubsw m5, m3, m6
28550 pmulhrsw m5, m7
28551 packuswb m4, m5
28552 movu [r0 + 1615 * 16], m4
28553
28554 ; mode 27 [row 8]
28555 movu m6, [r5 + 18 * 16]
28556 pmaddubsw m4, m0, m6
28557 pmulhrsw m4, m7
28558 pmaddubsw m5, m2, m6
28559 pmulhrsw m5, m7
28560 packuswb m4, m5
28561 movu [r0 + 1616 * 16], m4
28562
28563 ; mode 29 [row 1 - first half]
28564 movu [r0 + 1730 * 16], m4
28565
28566 pmaddubsw m4, m1, m6
28567 pmulhrsw m4, m7
28568 pmaddubsw m5, m3, m6
28569 pmulhrsw m5, m7
28570 packuswb m4, m5
28571 movu [r0 + 1617 * 16], m4
28572
28573 ; mode 29 [row 1 - second half]
28574 movu [r0 + 1731 * 16], m4
28575
28576 ; mode 27 [row 9]
28577 movu m6, [r5 + 20 * 16]
28578 pmaddubsw m4, m0, m6
28579 pmulhrsw m4, m7
28580 pmaddubsw m5, m2, m6
28581 pmulhrsw m5, m7
28582 packuswb m4, m5
28583 movu [r0 + 1618 * 16], m4
28584
28585 ; mode 28 [row 3 - first half]
28586 movu [r0 + 1670 * 16], m4
28587
28588 pmaddubsw m4, m1, m6
28589 pmulhrsw m4, m7
28590 pmaddubsw m5, m3, m6
28591 pmulhrsw m5, m7
28592 packuswb m4, m5
28593 movu [r0 + 1619 * 16], m4
28594
28595 ; mode 28 [row 3 - second half]
28596 movu [r0 + 1671 * 16], m4
28597
28598 ; mode 27 [row 10]
28599 movu m6, [r5 + 22 * 16]
28600 pmaddubsw m4, m0, m6
28601 pmulhrsw m4, m7
28602 pmaddubsw m5, m2, m6
28603 pmulhrsw m5, m7
28604 packuswb m4, m5
28605 movu [r0 + 1620 * 16], m4
28606 pmaddubsw m4, m1, m6
28607 pmulhrsw m4, m7
28608 pmaddubsw m5, m3, m6
28609 pmulhrsw m5, m7
28610 packuswb m4, m5
28611 movu [r0 + 1621 * 16], m4
28612
28613 ; mode 27 [row 11]
28614 movu m6, [r5 + 24 * 16]
28615 pmaddubsw m4, m0, m6
28616 pmulhrsw m4, m7
28617 pmaddubsw m5, m2, m6
28618 pmulhrsw m5, m7
28619 packuswb m4, m5
28620 movu [r0 + 1622 * 16], m4
28621 pmaddubsw m4, m1, m6
28622 pmulhrsw m4, m7
28623 pmaddubsw m5, m3, m6
28624 pmulhrsw m5, m7
28625 packuswb m4, m5
28626 movu [r0 + 1623 * 16], m4
28627
28628 ; mode 27 [row 12]
28629 movu m6, [r5 + 26 * 16]
28630 pmaddubsw m4, m0, m6
28631 pmulhrsw m4, m7
28632 pmaddubsw m5, m2, m6
28633 pmulhrsw m5, m7
28634 packuswb m4, m5
28635 movu [r0 + 1624 * 16], m4
28636
28637 ; mode 30 [row 1 - first half]
28638 movu [r0 + 1794 * 16], m4
28639
28640 ; mode 33 [row 0 - first half]
28641 movu [r0 + 1984 * 16], m4
28642
28643 pmaddubsw m4, m1, m6
28644 pmulhrsw m4, m7
28645 pmaddubsw m5, m3, m6
28646 pmulhrsw m5, m7
28647 packuswb m4, m5
28648 movu [r0 + 1625 * 16], m4
28649
28650 ; mode 30 [row 1 - second half]
28651 movu [r0 + 1795 * 16], m4
28652
28653 ; mode 33 [row 0 - second half]
28654 movu [r0 + 1985 * 16], m4
28655
28656 ; mode 27 [row 13]
28657 movu m6, [r5 + 28 * 16]
28658 pmaddubsw m4, m0, m6
28659 pmulhrsw m4, m7
28660 pmaddubsw m5, m2, m6
28661 pmulhrsw m5, m7
28662 packuswb m4, m5
28663 movu [r0 + 1626 * 16], m4
28664 pmaddubsw m4, m1, m6
28665 pmulhrsw m4, m7
28666 pmaddubsw m5, m3, m6
28667 pmulhrsw m5, m7
28668 packuswb m4, m5
28669 movu [r0 + 1627 * 16], m4
28670
28671 ; mode 27 [row 14]
28672 movu m6, [r5 + 30 * 16]
28673 pmaddubsw m4, m0, m6
28674 pmulhrsw m4, m7
28675 pmaddubsw m5, m2, m6
28676 pmulhrsw m5, m7
28677 packuswb m4, m5
28678 movu [r0 + 1628 * 16], m4
28679
28680 ; mode 28 [row 5 - first half]
28681 movu [r0 + 1674 * 16], m4
28682
28683 pmaddubsw m4, m1, m6
28684 pmulhrsw m4, m7
28685 pmaddubsw m5, m3, m6
28686 pmulhrsw m5, m7
28687 packuswb m4, m5
28688 movu [r0 + 1629 * 16], m4
28689
28690 ; mode 28 [row 5 - second half]
28691 movu [r0 + 1675 * 16], m4
28692
28693 ; mode 28 [row 0]
28694 movu m6, [r5 + 5 * 16]
28695 pmaddubsw m4, m0, m6
28696 pmulhrsw m4, m7
28697 pmaddubsw m5, m2, m6
28698 pmulhrsw m5, m7
28699 packuswb m4, m5
28700 movu [r0 + 1664 * 16], m4
28701 pmaddubsw m4, m1, m6
28702 pmulhrsw m4, m7
28703 pmaddubsw m5, m3, m6
28704 pmulhrsw m5, m7
28705 packuswb m4, m5
28706 movu [r0 + 1665 * 16], m4
28707
28708 ; mode 28 [row 2]
28709 movu m6, [r5 + 15 * 16]
28710 pmaddubsw m4, m0, m6
28711 pmulhrsw m4, m7
28712 pmaddubsw m5, m2, m6
28713 pmulhrsw m5, m7
28714 packuswb m4, m5
28715 movu [r0 + 1668 * 16], m4
28716 pmaddubsw m4, m1, m6
28717 pmulhrsw m4, m7
28718 pmaddubsw m5, m3, m6
28719 pmulhrsw m5, m7
28720 packuswb m4, m5
28721 movu [r0 + 1669 * 16], m4
28722
28723 ; mode 28 [row 4]
28724 movu m6, [r5 + 25 * 16]
28725 pmaddubsw m4, m0, m6
28726 pmulhrsw m4, m7
28727 pmaddubsw m5, m2, m6
28728 pmulhrsw m5, m7
28729 packuswb m4, m5
28730 movu [r0 + 1672 * 16], m4
28731 pmaddubsw m4, m1, m6
28732 pmulhrsw m4, m7
28733 pmaddubsw m5, m3, m6
28734 pmulhrsw m5, m7
28735 packuswb m4, m5
28736 movu [r0 + 1673 * 16], m4
28737
28738 ; mode 30 [row 0]
28739 movu m6, [r5 + 13 * 16]
28740 pmaddubsw m4, m0, m6
28741 pmulhrsw m4, m7
28742 pmaddubsw m5, m2, m6
28743 pmulhrsw m5, m7
28744 packuswb m4, m5
28745 movu [r0 + 1792 * 16], m4
28746 pmaddubsw m4, m1, m6
28747 pmulhrsw m4, m7
28748 pmaddubsw m5, m3, m6
28749 pmulhrsw m5, m7
28750 packuswb m4, m5
28751 movu [r0 + 1793 * 16], m4
28752
28753 ; mode 29 [row 0]
28754 movu m6, [r5 + 9 * 16]
28755 pmaddubsw m4, m0, m6
28756 pmulhrsw m4, m7
28757 pmaddubsw m5, m2, m6
28758 pmulhrsw m5, m7
28759 packuswb m4, m5
28760 movu [r0 + 1728 * 16], m4
28761 pmaddubsw m4, m1, m6
28762 pmulhrsw m4, m7
28763 pmaddubsw m5, m3, m6
28764 pmulhrsw m5, m7
28765 packuswb m4, m5
28766 movu [r0 + 1729 * 16], m4
28767
28768 ; mode 29 [row 2]
28769 movu m6, [r5 + 27 * 16]
28770 pmaddubsw m4, m0, m6
28771 pmulhrsw m4, m7
28772 pmaddubsw m5, m2, m6
28773 pmulhrsw m5, m7
28774 packuswb m4, m5
28775 movu [r0 + 1732 * 16], m4
28776 pmaddubsw m4, m1, m6
28777 pmulhrsw m4, m7
28778 pmaddubsw m5, m3, m6
28779 pmulhrsw m5, m7
28780 packuswb m4, m5
28781 movu [r0 + 1733 * 16], m4
28782
28783 ; mode 31 [row 0]
28784 movu m6, [r5 + 17 * 16]
28785 pmaddubsw m4, m0, m6
28786 pmulhrsw m4, m7
28787 pmaddubsw m5, m2, m6
28788 pmulhrsw m5, m7
28789 packuswb m4, m5
28790 movu [r0 + 1856 * 16], m4
28791 pmaddubsw m4, m1, m6
28792 pmulhrsw m4, m7
28793 pmaddubsw m5, m3, m6
28794 pmulhrsw m5, m7
28795 packuswb m4, m5
28796 movu [r0 + 1857 * 16], m4
28797
28798 ; mode 32 [row 0]
28799 movu m6, [r5 + 21 * 16]
28800 pmaddubsw m4, m0, m6
28801 pmulhrsw m4, m7
28802 pmaddubsw m5, m2, m6
28803 pmulhrsw m5, m7
28804 packuswb m4, m5
28805 movu [r0 + 1920 * 16], m4
28806 pmaddubsw m4, m1, m6
28807 pmulhrsw m4, m7
28808 pmaddubsw m5, m3, m6
28809 pmulhrsw m5, m7
28810 packuswb m4, m5
28811 movu [r0 + 1921 * 16], m4
28812
28813 ; mode 27 [row 15]
28814 movu m0, [r3 + 2]
28815 movd m1, [r3 + 3]
28816 palignr m1, m0, 1
28817 punpcklbw m0, m1
28818 movu m2, [r3 + 10]
28819 movd m3, [r3 + 11]
28820 palignr m3, m2, 1
28821 punpcklbw m2, m3
28822 movu m1, [r3 + 18]
28823 movd m3, [r3 + 19]
28824 palignr m3, m1, 1
28825 punpcklbw m1, m3
28826 movu m4, [r3 + 26]
28827 movd m5, [r3 + 27]
28828 palignr m5, m4, 1
28829 punpcklbw m4, m5
28830
28831 pshufb m5, m0, [tab_S2]
28832 movh [r0 + 1630 * 16], m5
28833 pshufb m5, m2, [tab_S2]
28834 movh [r0 + 1630 * 16 + 8], m5
28835 pshufb m5, m1, [tab_S2]
28836 movh [r0 + 1631 * 16], m5
28837 pshufb m5, m4, [tab_S2]
28838 movh [r0 + 1631 * 16 + 8], m5
28839
28840 ; mode 27 [row 16]
28841 movu m6, [r5 + 2 * 16]
28842 pmaddubsw m3, m0, m6
28843 pmulhrsw m3, m7
28844 pmaddubsw m5, m2, m6
28845 pmulhrsw m5, m7
28846 packuswb m3, m5
28847 movu [r0 + 1632 * 16], m3
28848
28849 ; mode 31 [row 1 - first half]
28850 movu [r0 + 1858 * 16], m3
28851
28852 pmaddubsw m3, m1, m6
28853 pmulhrsw m3, m7
28854 pmaddubsw m5, m4, m6
28855 pmulhrsw m5, m7
28856 packuswb m3, m5
28857 movu [r0 + 1633 * 16], m3
28858
28859 ; mode 31 [row 1 - second half]
28860 movu [r0 + 1859 * 16], m3
28861
28862 ; mode 27 [row 17]
28863 movu m6, [r5 + 4 * 16]
28864 pmaddubsw m3, m0, m6
28865 pmulhrsw m3, m7
28866 pmaddubsw m5, m2, m6
28867 pmulhrsw m5, m7
28868 packuswb m3, m5
28869 movu [r0 + 1634 * 16], m3
28870
28871 ; mode 29 [row 3 - first half]
28872 movu [r0 + 1734 * 16], m3
28873
28874 pmaddubsw m3, m1, m6
28875 pmulhrsw m3, m7
28876 pmaddubsw m5, m4, m6
28877 pmulhrsw m5, m7
28878 packuswb m3, m5
28879 movu [r0 + 1635 * 16], m3
28880
28881 ; mode 29 [row 3 - second half]
28882 movu [r0 + 1735 * 16], m3
28883
28884 ; mode 27 [row 18]
28885 movu m6, [r5 + 6 * 16]
28886 pmaddubsw m3, m0, m6
28887 pmulhrsw m3, m7
28888 pmaddubsw m5, m2, m6
28889 pmulhrsw m5, m7
28890 packuswb m3, m5
28891 movu [r0 + 1636 * 16], m3
28892 pmaddubsw m3, m1, m6
28893 pmulhrsw m3, m7
28894 pmaddubsw m5, m4, m6
28895 pmulhrsw m5, m7
28896 packuswb m3, m5
28897 movu [r0 + 1637 * 16], m3
28898
28899 ; mode 27 [row 19]
28900 movu m6, [r5 + 8 * 16]
28901 pmaddubsw m3, m0, m6
28902 pmulhrsw m3, m7
28903 pmaddubsw m5, m2, m6
28904 pmulhrsw m5, m7
28905 packuswb m3, m5
28906 movu [r0 + 1638 * 16], m3
28907
28908 ; mode 28 [row 7 - first half]
28909 movu [r0 + 1678 * 16], m3
28910
28911 pmaddubsw m3, m1, m6
28912 pmulhrsw m3, m7
28913 pmaddubsw m5, m4, m6
28914 pmulhrsw m5, m7
28915 packuswb m3, m5
28916 movu [r0 + 1639 * 16], m3
28917
28918 ; mode 28 [row 7 - second half]
28919 movu [r0 + 1679 * 16], m3
28920
28921 ; mode 27 [row 20]
28922 movu m6, [r5 + 10 * 16]
28923 pmaddubsw m3, m0, m6
28924 pmulhrsw m3, m7
28925 pmaddubsw m5, m2, m6
28926 pmulhrsw m5, m7
28927 packuswb m3, m5
28928 movu [r0 + 1640 * 16], m3
28929
28930 ; mode 32 [row 1 - first half]
28931 movu [r0 + 1922 * 16], m3
28932
28933 pmaddubsw m3, m1, m6
28934 pmulhrsw m3, m7
28935 pmaddubsw m5, m4, m6
28936 pmulhrsw m5, m7
28937 packuswb m3, m5
28938 movu [r0 + 1641 * 16], m3
28939
28940 ; mode 32 [row 1 - second half]
28941 movu [r0 + 1923 * 16], m3
28942
28943 ; mode 27 [row 21]
28944 movu m6, [r5 + 12 * 16]
28945 pmaddubsw m3, m0, m6
28946 pmulhrsw m3, m7
28947 pmaddubsw m5, m2, m6
28948 pmulhrsw m5, m7
28949 packuswb m3, m5
28950 movu [r0 + 1642 * 16], m3
28951 pmaddubsw m3, m1, m6
28952 pmulhrsw m3, m7
28953 pmaddubsw m5, m4, m6
28954 pmulhrsw m5, m7
28955 packuswb m3, m5
28956 movu [r0 + 1643 * 16], m3
28957
28958 ; mode 27 [row 22]
28959 movu m6, [r5 + 14 * 16]
28960 pmaddubsw m3, m0, m6
28961 pmulhrsw m3, m7
28962 pmaddubsw m5, m2, m6
28963 pmulhrsw m5, m7
28964 packuswb m3, m5
28965 movu [r0 + 1644 * 16], m3
28966 pmaddubsw m3, m1, m6
28967 pmulhrsw m3, m7
28968 pmaddubsw m5, m4, m6
28969 pmulhrsw m5, m7
28970 packuswb m3, m5
28971 movu [r0 + 1645 * 16], m3
28972
28973 ; mode 27 [row 23]
28974 movu m6, [r5 + 16 * 16]
28975 pmaddubsw m3, m0, m6
28976 pmulhrsw m3, m7
28977 pmaddubsw m5, m2, m6
28978 pmulhrsw m5, m7
28979 packuswb m3, m5
28980 movu [r0 + 1646 * 16], m3
28981 pmaddubsw m3, m1, m6
28982 pmulhrsw m3, m7
28983 pmaddubsw m5, m4, m6
28984 pmulhrsw m5, m7
28985 packuswb m3, m5
28986 movu [r0 + 1647 * 16], m3
28987
28988 ; mode 27 [row 24]
28989 movu m6, [r5 + 18 * 16]
28990 pmaddubsw m3, m0, m6
28991 pmulhrsw m3, m7
28992 pmaddubsw m5, m2, m6
28993 pmulhrsw m5, m7
28994 packuswb m3, m5
28995 movu [r0 + 1648 * 16], m3
28996
28997 ; mode 28 [row 9 - first half]
28998 movu [r0 + 1682 * 16], m3
28999
29000 pmaddubsw m3, m1, m6
29001 pmulhrsw m3, m7
29002 pmaddubsw m5, m4, m6
29003 pmulhrsw m5, m7
29004 packuswb m3, m5
29005 movu [r0 + 1649 * 16], m3
29006
29007 ; mode 28 [row 9 - second half]
29008 movu [r0 + 1683 * 16], m3
29009
29010 ; mode 27 [row 25]
29011 movu m6, [r5 + 20 * 16]
29012 pmaddubsw m3, m0, m6
29013 pmulhrsw m3, m7
29014 pmaddubsw m5, m2, m6
29015 pmulhrsw m5, m7
29016 packuswb m3, m5
29017 movu [r0 + 1650 * 16], m3
29018
29019 ; mode 30 [row 3 - first half]
29020 movu [r0 + 1798 * 16], m3
29021
29022 ; mode 33 [row 1 - first half]
29023 movu [r0 + 1986 * 16], m3
29024
29025 pmaddubsw m3, m1, m6
29026 pmulhrsw m3, m7
29027 pmaddubsw m5, m4, m6
29028 pmulhrsw m5, m7
29029 packuswb m3, m5
29030 movu [r0 + 1651 * 16], m3
29031
29032 ; mode 30 [row 3 - second half]
29033 movu [r0 + 1799 * 16], m3
29034
29035 ; mode 33 [row 1 - second half]
29036 movu [r0 + 1987 * 16], m3
29037
29038 ; mode 27 [row 26]
29039 movu m6, [r5 + 22 * 16]
29040 pmaddubsw m3, m0, m6
29041 pmulhrsw m3, m7
29042 pmaddubsw m5, m2, m6
29043 pmulhrsw m5, m7
29044 packuswb m3, m5
29045 movu [r0 + 1652 * 16], m3
29046
29047 ; mode 29 [row 5 - first half]
29048 movu [r0 + 1738 * 16], m3
29049
29050 pmaddubsw m3, m1, m6
29051 pmulhrsw m3, m7
29052 pmaddubsw m5, m4, m6
29053 pmulhrsw m5, m7
29054 packuswb m3, m5
29055 movu [r0 + 1653 * 16], m3
29056
29057 ; mode 29 [row 5 - second half]
29058 movu [r0 + 1739 * 16], m3
29059
29060 ; mode 27 [row 27]
29061 movu m6, [r5 + 24 * 16]
29062 pmaddubsw m3, m0, m6
29063 pmulhrsw m3, m7
29064 pmaddubsw m5, m2, m6
29065 pmulhrsw m5, m7
29066 packuswb m3, m5
29067 movu [r0 + 1654 * 16], m3
29068 pmaddubsw m3, m1, m6
29069 pmulhrsw m3, m7
29070 pmaddubsw m5, m4, m6
29071 pmulhrsw m5, m7
29072 packuswb m3, m5
29073 movu [r0 + 1655 * 16], m3
29074
29075 ; mode 27 [row 28]
29076 movu m6, [r5 + 26 * 16]
29077 pmaddubsw m3, m0, m6
29078 pmulhrsw m3, m7
29079 pmaddubsw m5, m2, m6
29080 pmulhrsw m5, m7
29081 packuswb m3, m5
29082 movu [r0 + 1656 * 16], m3
29083 pmaddubsw m3, m1, m6
29084 pmulhrsw m3, m7
29085 pmaddubsw m5, m4, m6
29086 pmulhrsw m5, m7
29087 packuswb m3, m5
29088 movu [r0 + 1657 * 16], m3
29089
29090 ; mode 27 [row 29]
29091 movu m6, [r5 + 28 * 16]
29092 pmaddubsw m3, m0, m6
29093 pmulhrsw m3, m7
29094 pmaddubsw m5, m2, m6
29095 pmulhrsw m5, m7
29096 packuswb m3, m5
29097 movu [r0 + 1658 * 16], m3
29098
29099 ; mode 28 [row 11 - first half]
29100 movu [r0 + 1686 * 16], m3
29101
29102 pmaddubsw m3, m1, m6
29103 pmulhrsw m3, m7
29104 pmaddubsw m5, m4, m6
29105 pmulhrsw m5, m7
29106 packuswb m3, m5
29107 movu [r0 + 1659 * 16], m3
29108
29109 ; mode 28 [row 11 - second half]
29110 movu [r0 + 1687 * 16], m3
29111
29112 ; mode 27 [row 30]
29113 movu m6, [r5 + 30 * 16]
29114 pmaddubsw m3, m0, m6
29115 pmulhrsw m3, m7
29116 pmaddubsw m5, m2, m6
29117 pmulhrsw m5, m7
29118 packuswb m3, m5
29119 movu [r0 + 1660 * 16], m3
29120 pmaddubsw m3, m1, m6
29121 pmulhrsw m3, m7
29122 pmaddubsw m5, m4, m6
29123 pmulhrsw m5, m7
29124 packuswb m3, m5
29125 movu [r0 + 1661 * 16], m3
29126
29127 ; mode 28 [row 6]
29128 movu m6, [r5 + 3 * 16]
29129 pmaddubsw m3, m0, m6
29130 pmulhrsw m3, m7
29131 pmaddubsw m5, m2, m6
29132 pmulhrsw m5, m7
29133 packuswb m3, m5
29134 movu [r0 + 1676 * 16], m3
29135 pmaddubsw m3, m1, m6
29136 pmulhrsw m3, m7
29137 pmaddubsw m5, m4, m6
29138 pmulhrsw m5, m7
29139 packuswb m3, m5
29140 movu [r0 + 1677 * 16], m3
29141
29142 ; mode 28 [row 8]
29143 movu m6, [r5 + 13 * 16]
29144 pmaddubsw m3, m0, m6
29145 pmulhrsw m3, m7
29146 pmaddubsw m5, m2, m6
29147 pmulhrsw m5, m7
29148 packuswb m3, m5
29149 movu [r0 + 1680 * 16], m3
29150
29151 ; mode 29 [row 4 - first half]
29152 movu [r0 + 1736 * 16], m3
29153
29154 pmaddubsw m3, m1, m6
29155 pmulhrsw m3, m7
29156 pmaddubsw m5, m4, m6
29157 pmulhrsw m5, m7
29158 packuswb m3, m5
29159 movu [r0 + 1681 * 16], m3
29160
29161 ; mode 29 [row 4 - second half]
29162 movu [r0 + 1737 * 16], m3
29163
29164 ; mode 28 [row 10]
29165 movu m6, [r5 + 23 * 16]
29166 pmaddubsw m3, m0, m6
29167 pmulhrsw m3, m7
29168 pmaddubsw m5, m2, m6
29169 pmulhrsw m5, m7
29170 packuswb m3, m5
29171 movu [r0 + 1684 * 16], m3
29172 pmaddubsw m3, m1, m6
29173 pmulhrsw m3, m7
29174 pmaddubsw m5, m4, m6
29175 pmulhrsw m5, m7
29176 packuswb m3, m5
29177 movu [r0 + 1685 * 16], m3
29178
29179 ; mode 29 [row 6]
29180 movu m6, [r5 + 31 * 16]
29181 pmaddubsw m3, m0, m6
29182 pmulhrsw m3, m7
29183 pmaddubsw m5, m2, m6
29184 pmulhrsw m5, m7
29185 packuswb m3, m5
29186 movu [r0 + 1740 * 16], m3
29187
29188 ; mode 32 [row 2 - first half]
29189 movu [r0 + 1924 * 16], m3
29190
29191 pmaddubsw m3, m1, m6
29192 pmulhrsw m3, m7
29193 pmaddubsw m5, m4, m6
29194 pmulhrsw m5, m7
29195 packuswb m3, m5
29196 movu [r0 + 1741 * 16], m3
29197
29198 ; mode 32 [row 2 - second half]
29199 movu [r0 + 1925 * 16], m3
29200
29201 ; mode 30 [row 2]
29202 movu m6, [r5 + 7 * 16]
29203 pmaddubsw m3, m0, m6
29204 pmulhrsw m3, m7
29205 pmaddubsw m5, m2, m6
29206 pmulhrsw m5, m7
29207 packuswb m3, m5
29208 movu [r0 + 1796 * 16], m3
29209 pmaddubsw m3, m1, m6
29210 pmulhrsw m3, m7
29211 pmaddubsw m5, m4, m6
29212 pmulhrsw m5, m7
29213 packuswb m3, m5
29214 movu [r0 + 1797 * 16], m3
29215
29216 ; mode 31 [row 2]
29217 movu m6, [r5 + 19 * 16]
29218 pmaddubsw m3, m0, m6
29219 pmulhrsw m3, m7
29220 pmaddubsw m5, m2, m6
29221 pmulhrsw m5, m7
29222 packuswb m3, m5
29223 movu [r0 + 1860 * 16], m3
29224 pmaddubsw m3, m1, m6
29225 pmulhrsw m3, m7
29226 pmaddubsw m5, m4, m6
29227 pmulhrsw m5, m7
29228 packuswb m3, m5
29229 movu [r0 + 1861 * 16], m3
29230
29231 ; mode 27 [row 31]
29232 movu m0, [r3 + 3]
29233 movd m1, [r3 + 4]
29234 palignr m1, m0, 1
29235 punpcklbw m0, m1
29236 movu m2, [r3 + 11]
29237 movd m3, [r3 + 12]
29238 palignr m3, m2, 1
29239 punpcklbw m2, m3
29240 movu m1, [r3 + 19]
29241 movd m3, [r3 + 20]
29242 palignr m3, m1, 1
29243 punpcklbw m1, m3
29244 movu m4, [r3 + 27]
29245 movd m5, [r3 + 28]
29246 palignr m5, m4, 1
29247 punpcklbw m4, m5
29248
29249 pshufb m5, m0, [tab_S2]
29250 movh [r0 + 1662 * 16], m5
29251 pshufb m5, m2, [tab_S2]
29252 movh [r0 + 1662 * 16 + 8], m5
29253 pshufb m5, m1, [tab_S2]
29254 movh [r0 + 1663 * 16], m5
29255 pshufb m5, m4, [tab_S2]
29256 movh [r0 + 1663 * 16 + 8], m5
29257
29258 ; mode 28 [row 12]
29259 movu m6, [r5 + 1 * 16]
29260 pmaddubsw m3, m0, m6
29261 pmulhrsw m3, m7
29262 pmaddubsw m5, m2, m6
29263 pmulhrsw m5, m7
29264 packuswb m3, m5
29265 movu [r0 + 1688 * 16], m3
29266
29267 ; mode 30 [row 4 - first half]
29268 movu [r0 + 1800 * 16], m3
29269
29270 pmaddubsw m3, m1, m6
29271 pmulhrsw m3, m7
29272 pmaddubsw m5, m4, m6
29273 pmulhrsw m5, m7
29274 packuswb m3, m5
29275 movu [r0 + 1689 * 16], m3
29276
29277 ; mode 30 [row 4 - second half]
29278 movu [r0 + 1801 * 16], m3
29279
29280 ; mode 28 [row 13]
29281 movu m6, [r5 + 6 * 16]
29282 pmaddubsw m3, m0, m6
29283 pmulhrsw m3, m7
29284 pmaddubsw m5, m2, m6
29285 pmulhrsw m5, m7
29286 packuswb m3, m5
29287 movu [r0 + 1690 * 16], m3
29288 pmaddubsw m3, m1, m6
29289 pmulhrsw m3, m7
29290 pmaddubsw m5, m4, m6
29291 pmulhrsw m5, m7
29292 packuswb m3, m5
29293 movu [r0 + 1691 * 16], m3
29294
29295 ; mode 28 [row 14]
29296 movu m6, [r5 + 11 * 16]
29297 pmaddubsw m3, m0, m6
29298 pmulhrsw m3, m7
29299 pmaddubsw m5, m2, m6
29300 pmulhrsw m5, m7
29301 packuswb m3, m5
29302 movu [r0 + 1692 * 16], m3
29303 pmaddubsw m3, m1, m6
29304 pmulhrsw m3, m7
29305 pmaddubsw m5, m4, m6
29306 pmulhrsw m5, m7
29307 packuswb m3, m5
29308 movu [r0 + 1693 * 16], m3
29309
29310 ; mode 28 [row 15]
29311 movu m6, [r5 + 16 * 16]
29312 pmaddubsw m3, m0, m6
29313 pmulhrsw m3, m7
29314 pmaddubsw m5, m2, m6
29315 pmulhrsw m5, m7
29316 packuswb m3, m5
29317 movu [r0 + 1694 * 16], m3
29318 pmaddubsw m3, m1, m6
29319 pmulhrsw m3, m7
29320 pmaddubsw m5, m4, m6
29321 pmulhrsw m5, m7
29322 packuswb m3, m5
29323 movu [r0 + 1695 * 16], m3
29324
29325 ; mode 28 [row 16]
29326 movu m6, [r5 + 21 * 16]
29327 pmaddubsw m3, m0, m6
29328 pmulhrsw m3, m7
29329 pmaddubsw m5, m2, m6
29330 pmulhrsw m5, m7
29331 packuswb m3, m5
29332 movu [r0 + 1696 * 16], m3
29333
29334 ; mode 31 [row 4 - first half]
29335 movu [r0 + 1864 * 16], m3
29336
29337 pmaddubsw m3, m1, m6
29338 pmulhrsw m3, m7
29339 pmaddubsw m5, m4, m6
29340 pmulhrsw m5, m7
29341 packuswb m3, m5
29342 movu [r0 + 1697 * 16], m3
29343
29344 ; mode 31 [row 4 - second half]
29345 movu [r0 + 1865 * 16], m3
29346
29347 ; mode 28 [row 17]
29348 movu m6, [r5 + 26 * 16]
29349 pmaddubsw m3, m0, m6
29350 pmulhrsw m3, m7
29351 pmaddubsw m5, m2, m6
29352 pmulhrsw m5, m7
29353 packuswb m3, m5
29354 movu [r0 + 1698 * 16], m3
29355
29356 ; mode 29 [row 9 - first half]
29357 movu [r0 + 1746 * 16], m3
29358
29359 pmaddubsw m3, m1, m6
29360 pmulhrsw m3, m7
29361 pmaddubsw m5, m4, m6
29362 pmulhrsw m5, m7
29363 packuswb m3, m5
29364 movu [r0 + 1699 * 16], m3
29365
29366 ; mode 29 [row 9 - second half]
29367 movu [r0 + 1747 * 16], m3
29368
29369 ; mode 28 [row 18]
29370 movu m6, [r5 + 31 * 16]
29371 pmaddubsw m3, m0, m6
29372 pmulhrsw m3, m7
29373 pmaddubsw m5, m2, m6
29374 pmulhrsw m5, m7
29375 packuswb m3, m5
29376 movu [r0 + 1700 * 16], m3
29377 pmaddubsw m3, m1, m6
29378 pmulhrsw m3, m7
29379 pmaddubsw m5, m4, m6
29380 pmulhrsw m5, m7
29381 packuswb m3, m5
29382 movu [r0 + 1701 * 16], m3
29383
29384 ; mode 29 [row 7]
29385 movu m6, [r5 + 8 * 16]
29386 pmaddubsw m3, m0, m6
29387 pmulhrsw m3, m7
29388 pmaddubsw m5, m2, m6
29389 pmulhrsw m5, m7
29390 packuswb m3, m5
29391 movu [r0 + 1742 * 16], m3
29392 pmaddubsw m3, m1, m6
29393 pmulhrsw m3, m7
29394 pmaddubsw m5, m4, m6
29395 pmulhrsw m5, m7
29396 packuswb m3, m5
29397 movu [r0 + 1743 * 16], m3
29398
29399 ; mode 29 [row 8]
29400 movu m6, [r5 + 17 * 16]
29401 pmaddubsw m3, m0, m6
29402 pmulhrsw m3, m7
29403 pmaddubsw m5, m2, m6
29404 pmulhrsw m5, m7
29405 packuswb m3, m5
29406 movu [r0 + 1744 * 16], m3
29407 pmaddubsw m3, m1, m6
29408 pmulhrsw m3, m7
29409 pmaddubsw m5, m4, m6
29410 pmulhrsw m5, m7
29411 packuswb m3, m5
29412 movu [r0 + 1745 * 16], m3
29413
29414 ; mode 30 [row 5]
29415 movu m6, [r5 + 14 * 16]
29416 pmaddubsw m3, m0, m6
29417 pmulhrsw m3, m7
29418 pmaddubsw m5, m2, m6
29419 pmulhrsw m5, m7
29420 packuswb m3, m5
29421 movu [r0 + 1802 * 16], m3
29422
29423 ; mode 33 [row 2 - first half]
29424 movu [r0 + 1988 * 16], m3
29425
29426 pmaddubsw m3, m1, m6
29427 pmulhrsw m3, m7
29428 pmaddubsw m5, m4, m6
29429 pmulhrsw m5, m7
29430 packuswb m3, m5
29431 movu [r0 + 1803 * 16], m3
29432
29433 ; mode 33 [row 2 - second half]
29434 movu [r0 + 1989 * 16], m3
29435
29436 ; mode 30 [row 6]
29437 movu m6, [r5 + 27 * 16]
29438 pmaddubsw m3, m0, m6
29439 pmulhrsw m3, m7
29440 pmaddubsw m5, m2, m6
29441 pmulhrsw m5, m7
29442 packuswb m3, m5
29443 movu [r0 + 1804 * 16], m3
29444 pmaddubsw m3, m1, m6
29445 pmulhrsw m3, m7
29446 pmaddubsw m5, m4, m6
29447 pmulhrsw m5, m7
29448 packuswb m3, m5
29449 movu [r0 + 1805 * 16], m3
29450
29451 ; mode 31 [row 3]
29452 movu m6, [r5 + 4 * 16]
29453 pmaddubsw m3, m0, m6
29454 pmulhrsw m3, m7
29455 pmaddubsw m5, m2, m6
29456 pmulhrsw m5, m7
29457 packuswb m3, m5
29458 movu [r0 + 1862 * 16], m3
29459 pmaddubsw m3, m1, m6
29460 pmulhrsw m3, m7
29461 pmaddubsw m5, m4, m6
29462 pmulhrsw m5, m7
29463 packuswb m3, m5
29464 movu [r0 + 1863 * 16], m3
29465
29466 ; mode 32 [row 3]
29467 movu m6, [r5 + 20 * 16]
29468 pmaddubsw m3, m0, m6
29469 pmulhrsw m3, m7
29470 pmaddubsw m5, m2, m6
29471 pmulhrsw m5, m7
29472 packuswb m3, m5
29473 movu [r0 + 1926 * 16], m3
29474 pmaddubsw m3, m1, m6
29475 pmulhrsw m3, m7
29476 pmaddubsw m5, m4, m6
29477 pmulhrsw m5, m7
29478 packuswb m3, m5
29479 movu [r0 + 1927 * 16], m3
29480
29481 ; mode 28 [row 19]
29482 movu m6, [r5 + 4 * 16]
29483 movu m0, [r3 + 4]
29484 movd m1, [r3 + 5]
29485 palignr m1, m0, 1
29486 punpcklbw m0, m1
29487 pmaddubsw m3, m0, m6
29488 pmulhrsw m3, m7
29489 movu m2, [r3 + 12]
29490 movd m4, [r3 + 13]
29491 palignr m4, m2, 1
29492 punpcklbw m2, m4
29493 pmaddubsw m5, m2, m6
29494 pmulhrsw m5, m7
29495 packuswb m3, m5
29496 movu [r0 + 1702 * 16], m3
29497
29498 movu m1, [r3 + 20]
29499 movd m3, [r3 + 21]
29500 palignr m3, m1, 1
29501 punpcklbw m1, m3
29502 pmaddubsw m3, m1, m6
29503 pmulhrsw m3, m7
29504 movu m4, [r3 + 28]
29505 movd m5, [r3 + 29]
29506 palignr m5, m4, 1
29507 punpcklbw m4, m5
29508 pmaddubsw m5, m4, m6
29509 pmulhrsw m5, m7
29510 packuswb m3, m5
29511 movu [r0 + 1703 * 16], m3
29512
29513 ; mode 28 [row 20]
29514 movu m6, [r5 + 9 * 16]
29515 pmaddubsw m3, m0, m6
29516 pmulhrsw m3, m7
29517 pmaddubsw m5, m2, m6
29518 pmulhrsw m5, m7
29519 packuswb m3, m5
29520 movu [r0 + 1704 * 16], m3
29521
29522 ; mode 32 [row 4 - first half]
29523 movu [r0 + 1928 * 16], m3
29524
29525 pmaddubsw m3, m1, m6
29526 pmulhrsw m3, m7
29527 pmaddubsw m5, m4, m6
29528 pmulhrsw m5, m7
29529 packuswb m3, m5
29530 movu [r0 + 1705 * 16], m3
29531
29532 ; mode 32 [row 4 - second half]
29533 movu [r0 + 1929 * 16], m3
29534
29535 ; mode 28 [row 21]
29536 movu m6, [r5 + 14 * 16]
29537 pmaddubsw m3, m0, m6
29538 pmulhrsw m3, m7
29539 pmaddubsw m5, m2, m6
29540 pmulhrsw m5, m7
29541 packuswb m3, m5
29542 movu [r0 + 1706 * 16], m3
29543 pmaddubsw m3, m1, m6
29544 pmulhrsw m3, m7
29545 pmaddubsw m5, m4, m6
29546 pmulhrsw m5, m7
29547 packuswb m3, m5
29548 movu [r0 + 1707 * 16], m3
29549
29550 ; mode 28 [row 22]
29551 movu m6, [r5 + 19 * 16]
29552 pmaddubsw m3, m0, m6
29553 pmulhrsw m3, m7
29554 pmaddubsw m5, m2, m6
29555 pmulhrsw m5, m7
29556 packuswb m3, m5
29557 movu [r0 + 1708 * 16], m3
29558 pmaddubsw m3, m1, m6
29559 pmulhrsw m3, m7
29560 pmaddubsw m5, m4, m6
29561 pmulhrsw m5, m7
29562 packuswb m3, m5
29563 movu [r0 + 1709 * 16], m3
29564
29565 ; mode 28 [row 23]
29566 movu m6, [r5 + 24 * 16]
29567 pmaddubsw m3, m0, m6
29568 pmulhrsw m3, m7
29569 pmaddubsw m5, m2, m6
29570 pmulhrsw m5, m7
29571 packuswb m3, m5
29572 movu [r0 + 1710 * 16], m3
29573 pmaddubsw m3, m1, m6
29574 pmulhrsw m3, m7
29575 pmaddubsw m5, m4, m6
29576 pmulhrsw m5, m7
29577 packuswb m3, m5
29578 movu [r0 + 1711 * 16], m3
29579
29580 ; mode 28 [row 24]
29581 movu m6, [r5 + 29 * 16]
29582 pmaddubsw m3, m0, m6
29583 pmulhrsw m3, m7
29584 pmaddubsw m5, m2, m6
29585 pmulhrsw m5, m7
29586 packuswb m3, m5
29587 movu [r0 + 1712 * 16], m3
29588 pmaddubsw m3, m1, m6
29589 pmulhrsw m3, m7
29590 pmaddubsw m5, m4, m6
29591 pmulhrsw m5, m7
29592 packuswb m3, m5
29593 movu [r0 + 1713 * 16], m3
29594
29595 ; mode 29 [row 10]
29596 movu m6, [r5 + 3 * 16]
29597 pmaddubsw m3, m0, m6
29598 pmulhrsw m3, m7
29599 pmaddubsw m5, m2, m6
29600 pmulhrsw m5, m7
29601 packuswb m3, m5
29602 movu [r0 + 1748 * 16], m3
29603 pmaddubsw m3, m1, m6
29604 pmulhrsw m3, m7
29605 pmaddubsw m5, m4, m6
29606 pmulhrsw m5, m7
29607 packuswb m3, m5
29608 movu [r0 + 1749 * 16], m3
29609
29610 ; mode 29 [row 11]
29611 movu m6, [r5 + 12 * 16]
29612 pmaddubsw m3, m0, m6
29613 pmulhrsw m3, m7
29614 pmaddubsw m5, m2, m6
29615 pmulhrsw m5, m7
29616 packuswb m3, m5
29617 movu [r0 + 1750 * 16], m3
29618 pmaddubsw m3, m1, m6
29619 pmulhrsw m3, m7
29620 pmaddubsw m5, m4, m6
29621 pmulhrsw m5, m7
29622 packuswb m3, m5
29623 movu [r0 + 1751 * 16], m3
29624
29625 ; mode 29 [row 12]
29626 movu m6, [r5 + 21 * 16]
29627 pmaddubsw m3, m0, m6
29628 pmulhrsw m3, m7
29629 pmaddubsw m5, m2, m6
29630 pmulhrsw m5, m7
29631 packuswb m3, m5
29632 movu [r0 + 1752 * 16], m3
29633
29634 ; mode 30 [row 8 - first half]
29635 movu [r0 + 1808 * 16], m3
29636
29637 pmaddubsw m3, m1, m6
29638 pmulhrsw m3, m7
29639 pmaddubsw m5, m4, m6
29640 pmulhrsw m5, m7
29641 packuswb m3, m5
29642 movu [r0 + 1753 * 16], m3
29643
29644 ; mode 30 [row 8 - second half]
29645 movu [r0 + 1809 * 16], m3
29646
29647 ; mode 29 [row 13]
29648 movu m6, [r5 + 30 * 16]
29649 pmaddubsw m3, m0, m6
29650 pmulhrsw m3, m7
29651 pmaddubsw m5, m2, m6
29652 pmulhrsw m5, m7
29653 packuswb m3, m5
29654 movu [r0 + 1754 * 16], m3
29655
29656 ; mode 32 [row 5 - first half]
29657 movu [r0 + 1930 * 16], m3
29658
29659 pmaddubsw m3, m1, m6
29660 pmulhrsw m3, m7
29661 pmaddubsw m5, m4, m6
29662 pmulhrsw m5, m7
29663 packuswb m3, m5
29664 movu [r0 + 1755 * 16], m3
29665
29666 ; mode 32 [row 5 - second half]
29667 movu [r0 + 1931 * 16], m3
29668
29669 ; mode 30 [row 7]
29670 movu m6, [r5 + 8 * 16]
29671 pmaddubsw m3, m0, m6
29672 pmulhrsw m3, m7
29673 pmaddubsw m5, m2, m6
29674 pmulhrsw m5, m7
29675 packuswb m3, m5
29676 movu [r0 + 1806 * 16], m3
29677
29678 ; mode 33 [row 3 - first half]
29679 movu [r0 + 1990 * 16], m3
29680
29681 pmaddubsw m3, m1, m6
29682 pmulhrsw m3, m7
29683 pmaddubsw m5, m4, m6
29684 pmulhrsw m5, m7
29685 packuswb m3, m5
29686 movu [r0 + 1807 * 16], m3
29687
29688 ; mode 33 [row 3 - second half]
29689 movu [r0 + 1991 * 16], m3
29690
29691 ; mode 31 [row 5]
29692 movu m6, [r5 + 6 * 16]
29693 pmaddubsw m3, m0, m6
29694 pmulhrsw m3, m7
29695 pmaddubsw m5, m2, m6
29696 pmulhrsw m5, m7
29697 packuswb m3, m5
29698 movu [r0 + 1866 * 16], m3
29699 pmaddubsw m3, m1, m6
29700 pmulhrsw m3, m7
29701 pmaddubsw m5, m4, m6
29702 pmulhrsw m5, m7
29703 packuswb m3, m5
29704 movu [r0 + 1867 * 16], m3
29705
29706 ; mode 31 [row 6]
29707 movu m6, [r5 + 23 * 16]
29708 pmaddubsw m3, m0, m6
29709 pmulhrsw m3, m7
29710 pmaddubsw m5, m2, m6
29711 pmulhrsw m5, m7
29712 packuswb m3, m5
29713 movu [r0 + 1868 * 16], m3
29714 pmaddubsw m3, m1, m6
29715 pmulhrsw m3, m7
29716 pmaddubsw m5, m4, m6
29717 pmulhrsw m5, m7
29718 packuswb m3, m5
29719 movu [r0 + 1869 * 16], m3
29720
29721 ; mode 28 [row 25]
29722 movu m6, [r5 + 2 * 16]
29723 movu m0, [r3 + 5]
29724 movd m1, [r3 + 6]
29725 palignr m1, m0, 1
29726 punpcklbw m0, m1
29727 pmaddubsw m3, m0, m6
29728 pmulhrsw m3, m7
29729 movu m2, [r3 + 13]
29730 movd m4, [r3 + 14]
29731 palignr m4, m2, 1
29732 punpcklbw m2, m4
29733 pmaddubsw m5, m2, m6
29734 pmulhrsw m5, m7
29735 packuswb m3, m5
29736 movu [r0 + 1714 * 16], m3
29737
29738 movu m1, [r3 + 21]
29739 movd m3, [r3 + 22]
29740 palignr m3, m1, 1
29741 punpcklbw m1, m3
29742 pmaddubsw m3, m1, m6
29743 pmulhrsw m3, m7
29744 movu m4, [r3 + 29]
29745 movd m5, [r3 + 30]
29746 palignr m5, m4, 1
29747 punpcklbw m4, m5
29748 pmaddubsw m5, m4, m6
29749 pmulhrsw m5, m7
29750 packuswb m3, m5
29751 movu [r0 + 1715 * 16], m3
29752
29753 ; mode 28 [row 26]
29754 movu m6, [r5 + 7 * 16]
29755 pmaddubsw m3, m0, m6
29756 pmulhrsw m3, m7
29757 pmaddubsw m5, m2, m6
29758 pmulhrsw m5, m7
29759 packuswb m3, m5
29760 movu [r0 + 1716 * 16], m3
29761
29762 ; mode 29 [row 14 - first half]
29763 movu [r0 + 1756 * 16], m3
29764
29765 pmaddubsw m3, m1, m6
29766 pmulhrsw m3, m7
29767 pmaddubsw m5, m4, m6
29768 pmulhrsw m5, m7
29769 packuswb m3, m5
29770 movu [r0 + 1717 * 16], m3
29771
29772 ; mode 29 [row 14 - second half]
29773 movu [r0 + 1757 * 16], m3
29774
29775 ; mode 28 [row 27]
29776 movu m6, [r5 + 12 * 16]
29777 pmaddubsw m3, m0, m6
29778 pmulhrsw m3, m7
29779 pmaddubsw m5, m2, m6
29780 pmulhrsw m5, m7
29781 packuswb m3, m5
29782 movu [r0 + 1718 * 16], m3
29783 pmaddubsw m3, m1, m6
29784 pmulhrsw m3, m7
29785 pmaddubsw m5, m4, m6
29786 pmulhrsw m5, m7
29787 packuswb m3, m5
29788 movu [r0 + 1719 * 16], m3
29789
29790 ; mode 28 [row 28]
29791 movu m6, [r5 + 17 * 16]
29792 pmaddubsw m3, m0, m6
29793 pmulhrsw m3, m7
29794 pmaddubsw m5, m2, m6
29795 pmulhrsw m5, m7
29796 packuswb m3, m5
29797 movu [r0 + 1720 * 16], m3
29798 pmaddubsw m3, m1, m6
29799 pmulhrsw m3, m7
29800 pmaddubsw m5, m4, m6
29801 pmulhrsw m5, m7
29802 packuswb m3, m5
29803 movu [r0 + 1721 * 16], m3
29804
29805 ; mode 28 [row 29]
29806 movu m6, [r5 + 22 * 16]
29807 pmaddubsw m3, m0, m6
29808 pmulhrsw m3, m7
29809 pmaddubsw m5, m2, m6
29810 pmulhrsw m5, m7
29811 packuswb m3, m5
29812 movu [r0 + 1722 * 16], m3
29813 pmaddubsw m3, m1, m6
29814 pmulhrsw m3, m7
29815 pmaddubsw m5, m4, m6
29816 pmulhrsw m5, m7
29817 packuswb m3, m5
29818 movu [r0 + 1723 * 16], m3
29819
29820 ; mode 28 [row 30]
29821 movu m6, [r5 + 27 * 16]
29822 pmaddubsw m3, m0, m6
29823 pmulhrsw m3, m7
29824 pmaddubsw m5, m2, m6
29825 pmulhrsw m5, m7
29826 packuswb m3, m5
29827 movu [r0 + 1724 * 16], m3
29828 pmaddubsw m3, m1, m6
29829 pmulhrsw m3, m7
29830 pmaddubsw m5, m4, m6
29831 pmulhrsw m5, m7
29832 packuswb m3, m5
29833 movu [r0 + 1725 * 16], m3
29834
29835 ; mode 29 [row 15]
29836 movu m6, [r5 + 16 * 16]
29837 pmaddubsw m3, m0, m6
29838 pmulhrsw m3, m7
29839 pmaddubsw m5, m2, m6
29840 pmulhrsw m5, m7
29841 packuswb m3, m5
29842 movu [r0 + 1758 * 16], m3
29843 pmaddubsw m3, m1, m6
29844 pmulhrsw m3, m7
29845 pmaddubsw m5, m4, m6
29846 pmulhrsw m5, m7
29847 packuswb m3, m5
29848 movu [r0 + 1759 * 16], m3
29849
29850 ; mode 29 [row 16]
29851 movu m6, [r5 + 25 * 16]
29852 pmaddubsw m3, m0, m6
29853 pmulhrsw m3, m7
29854 pmaddubsw m5, m2, m6
29855 pmulhrsw m5, m7
29856 packuswb m3, m5
29857 movu [r0 + 1760 * 16], m3
29858 pmaddubsw m3, m1, m6
29859 pmulhrsw m3, m7
29860 pmaddubsw m5, m4, m6
29861 pmulhrsw m5, m7
29862 packuswb m3, m5
29863 movu [r0 + 1761 * 16], m3
29864
29865 ; mode 30 [row 9]
29866 movu m6, [r5 + 2 * 16]
29867 pmaddubsw m3, m0, m6
29868 pmulhrsw m3, m7
29869 pmaddubsw m5, m2, m6
29870 pmulhrsw m5, m7
29871 packuswb m3, m5
29872 movu [r0 + 1810 * 16], m3
29873
29874 ; mode 33 [row 4 - first half]
29875 movu [r0 + 1992 * 16], m3
29876
29877 pmaddubsw m3, m1, m6
29878 pmulhrsw m3, m7
29879 pmaddubsw m5, m4, m6
29880 pmulhrsw m5, m7
29881 packuswb m3, m5
29882 movu [r0 + 1811 * 16], m3
29883
29884 ; mode 33 [row 4 - second half]
29885 movu [r0 + 1993 * 16], m3
29886
29887 ; mode 30 [row 10]
29888 movu m6, [r5 + 15 * 16]
29889 pmaddubsw m3, m0, m6
29890 pmulhrsw m3, m7
29891 pmaddubsw m5, m2, m6
29892 pmulhrsw m5, m7
29893 packuswb m3, m5
29894 movu [r0 + 1812 * 16], m3
29895 pmaddubsw m3, m1, m6
29896 pmulhrsw m3, m7
29897 pmaddubsw m5, m4, m6
29898 pmulhrsw m5, m7
29899 packuswb m3, m5
29900 movu [r0 + 1813 * 16], m3
29901
29902 ; mode 31 [row 7]
29903 movu m6, [r5 + 8 * 16]
29904 pmaddubsw m3, m0, m6
29905 pmulhrsw m3, m7
29906 pmaddubsw m5, m2, m6
29907 pmulhrsw m5, m7
29908 packuswb m3, m5
29909 movu [r0 + 1870 * 16], m3
29910 pmaddubsw m3, m1, m6
29911 pmulhrsw m3, m7
29912 pmaddubsw m5, m4, m6
29913 pmulhrsw m5, m7
29914 packuswb m3, m5
29915 movu [r0 + 1871 * 16], m3
29916
29917 ; mode 31 [row 8]
29918 movu m6, [r5 + 25 * 16]
29919 pmaddubsw m3, m0, m6
29920 pmulhrsw m3, m7
29921 pmaddubsw m5, m2, m6
29922 pmulhrsw m5, m7
29923 packuswb m3, m5
29924 movu [r0 + 1872 * 16], m3
29925 pmaddubsw m3, m1, m6
29926 pmulhrsw m3, m7
29927 pmaddubsw m5, m4, m6
29928 pmulhrsw m5, m7
29929 packuswb m3, m5
29930 movu [r0 + 1873 * 16], m3
29931
29932 ; mode 32 [row 6]
29933 movu m6, [r5 + 19 * 16]
29934 pmaddubsw m3, m0, m6
29935 pmulhrsw m3, m7
29936 pmaddubsw m5, m2, m6
29937 pmulhrsw m5, m7
29938 packuswb m3, m5
29939 movu [r0 + 1932 * 16], m3
29940 pmaddubsw m3, m1, m6
29941 pmulhrsw m3, m7
29942 pmaddubsw m5, m4, m6
29943 pmulhrsw m5, m7
29944 packuswb m3, m5
29945 movu [r0 + 1933 * 16], m3
29946
29947 ; mode 30 [row 11]
29948 movu m6, [r5 + 28 * 16]
29949 pmaddubsw m3, m0, m6
29950 pmulhrsw m3, m7
29951 pmaddubsw m5, m2, m6
29952 pmulhrsw m5, m7
29953 packuswb m3, m5
29954 movu [r0 + 1814 * 16], m3
29955
29956 ; mode 33 [row 5 - first half]
29957 movu [r0 + 1994 * 16], m3
29958
29959 pmaddubsw m3, m1, m6
29960 pmulhrsw m3, m7
29961 pmaddubsw m5, m4, m6
29962 pmulhrsw m5, m7
29963 packuswb m3, m5
29964 movu [r0 + 1815 * 16], m3
29965
29966 ; mode 33 [row 5 - second half]
29967 movu [r0 + 1995 * 16], m3
29968
29969 ; mode 28 [row 31]
29970 movu m0, [r3 + 6]
29971 movd m1, [r3 + 7]
29972 palignr m1, m0, 1
29973 punpcklbw m0, m1
29974 movu m2, [r3 + 14]
29975 movd m3, [r3 + 15]
29976 palignr m3, m2, 1
29977 punpcklbw m2, m3
29978 movu m1, [r3 + 22]
29979 movd m3, [r3 + 23]
29980 palignr m3, m1, 1
29981 punpcklbw m1, m3
29982 movu m4, [r3 + 30]
29983 movd m5, [r3 + 31]
29984 palignr m5, m4, 1
29985 punpcklbw m4, m5
29986
29987 pshufb m5, m0, [tab_S2]
29988 movh [r0 + 1726 * 16], m5
29989 pshufb m5, m2, [tab_S2]
29990 movh [r0 + 1726 * 16 + 8], m5
29991 pshufb m5, m1, [tab_S2]
29992 movh [r0 + 1727 * 16], m5
29993 pshufb m5, m4, [tab_S2]
29994 movh [r0 + 1727 * 16 + 8], m5
29995
29996 ; mode 29 [row 17]
29997 movu m6, [r5 + 2 * 16]
29998 pmaddubsw m3, m0, m6
29999 pmulhrsw m3, m7
30000 pmaddubsw m5, m2, m6
30001 pmulhrsw m5, m7
30002 packuswb m3, m5
30003 movu [r0 + 1762 * 16], m3
30004 pmaddubsw m3, m1, m6
30005 pmulhrsw m3, m7
30006 pmaddubsw m5, m4, m6
30007 pmulhrsw m5, m7
30008 packuswb m3, m5
30009 movu [r0 + 1763 * 16], m3
30010
30011 ; mode 29 [row 18]
30012 movu m6, [r5 + 11 * 16]
30013 pmaddubsw m3, m0, m6
30014 pmulhrsw m3, m7
30015 pmaddubsw m5, m2, m6
30016 pmulhrsw m5, m7
30017 packuswb m3, m5
30018 movu [r0 + 1764 * 16], m3
30019 pmaddubsw m3, m1, m6
30020 pmulhrsw m3, m7
30021 pmaddubsw m5, m4, m6
30022 pmulhrsw m5, m7
30023 packuswb m3, m5
30024 movu [r0 + 1765 * 16], m3
30025
30026 ; mode 29 [row 19]
30027 movu m6, [r5 + 20 * 16]
30028 pmaddubsw m3, m0, m6
30029 pmulhrsw m3, m7
30030 pmaddubsw m5, m2, m6
30031 pmulhrsw m5, m7
30032 packuswb m3, m5
30033 movu [r0 + 1766 * 16], m3
30034 pmaddubsw m3, m1, m6
30035 pmulhrsw m3, m7
30036 pmaddubsw m5, m4, m6
30037 pmulhrsw m5, m7
30038 packuswb m3, m5
30039 movu [r0 + 1767 * 16], m3
30040
30041 ; mode 29 [row 20]
30042 movu m6, [r5 + 29 * 16]
30043 pmaddubsw m3, m0, m6
30044 pmulhrsw m3, m7
30045 pmaddubsw m5, m2, m6
30046 pmulhrsw m5, m7
30047 packuswb m3, m5
30048 movu [r0 + 1768 * 16], m3
30049
30050 ; mode 32 [row 8 - first half]
30051 movu [r0 + 1936 * 16], m3
30052
30053 pmaddubsw m3, m1, m6
30054 pmulhrsw m3, m7
30055 pmaddubsw m5, m4, m6
30056 pmulhrsw m5, m7
30057 packuswb m3, m5
30058 movu [r0 + 1769 * 16], m3
30059
30060 ; mode 32 [row 8 - second half]
30061 movu [r0 + 1937 * 16], m3
30062
30063 ; mode 30 [row 12]
30064 movu m6, [r5 + 9 * 16]
30065 pmaddubsw m3, m0, m6
30066 pmulhrsw m3, m7
30067 pmaddubsw m5, m2, m6
30068 pmulhrsw m5, m7
30069 packuswb m3, m5
30070 movu [r0 + 1816 * 16], m3
30071 pmaddubsw m3, m1, m6
30072 pmulhrsw m3, m7
30073 pmaddubsw m5, m4, m6
30074 pmulhrsw m5, m7
30075 packuswb m3, m5
30076 movu [r0 + 1817 * 16], m3
30077
30078 ; mode 30 [row 13]
30079 movu m6, [r5 + 22 * 16]
30080 pmaddubsw m3, m0, m6
30081 pmulhrsw m3, m7
30082 pmaddubsw m5, m2, m6
30083 pmulhrsw m5, m7
30084 packuswb m3, m5
30085 movu [r0 + 1818 * 16], m3
30086
30087 ; mode 33 [row 6 - first half]
30088 movu [r0 + 1996 * 16], m3
30089
30090 pmaddubsw m3, m1, m6
30091 pmulhrsw m3, m7
30092 pmaddubsw m5, m4, m6
30093 pmulhrsw m5, m7
30094 packuswb m3, m5
30095 movu [r0 + 1819 * 16], m3
30096
30097 ; mode 33 [row 6 - second half]
30098 movu [r0 + 1997 * 16], m3
30099
30100 ; mode 31 [row 9]
30101 movu m6, [r5 + 10 * 16]
30102 pmaddubsw m3, m0, m6
30103 pmulhrsw m3, m7
30104 pmaddubsw m5, m2, m6
30105 pmulhrsw m5, m7
30106 packuswb m3, m5
30107 movu [r0 + 1874 * 16], m3
30108 pmaddubsw m3, m1, m6
30109 pmulhrsw m3, m7
30110 pmaddubsw m5, m4, m6
30111 pmulhrsw m5, m7
30112 packuswb m3, m5
30113 movu [r0 + 1875 * 16], m3
30114
30115 ; mode 31 [row 10]
30116 movu m6, [r5 + 27 * 16]
30117 pmaddubsw m3, m0, m6
30118 pmulhrsw m3, m7
30119 pmaddubsw m5, m2, m6
30120 pmulhrsw m5, m7
30121 packuswb m3, m5
30122 movu [r0 + 1876 * 16], m3
30123 pmaddubsw m3, m1, m6
30124 pmulhrsw m3, m7
30125 pmaddubsw m5, m4, m6
30126 pmulhrsw m5, m7
30127 packuswb m3, m5
30128 movu [r0 + 1877 * 16], m3
30129
30130 ; mode 32 [row 7]
30131 movu m6, [r5 + 8 * 16]
30132 pmaddubsw m3, m0, m6
30133 pmulhrsw m3, m7
30134 pmaddubsw m5, m2, m6
30135 pmulhrsw m5, m7
30136 packuswb m3, m5
30137 movu [r0 + 1934 * 16], m3
30138 pmaddubsw m3, m1, m6
30139 pmulhrsw m3, m7
30140 pmaddubsw m5, m4, m6
30141 pmulhrsw m5, m7
30142 packuswb m3, m5
30143 movu [r0 + 1935 * 16], m3
30144
30145 ; mode 29 [row 21]
30146 movu m6, [r5 + 6 * 16]
30147 movu m0, [r3 + 7]
30148 movd m1, [r3 + 8]
30149 palignr m1, m0, 1
30150 punpcklbw m0, m1
30151 pmaddubsw m3, m0, m6
30152 pmulhrsw m3, m7
30153 movu m2, [r3 + 15]
30154 movd m4, [r3 + 16]
30155 palignr m4, m2, 1
30156 punpcklbw m2, m4
30157 pmaddubsw m5, m2, m6
30158 pmulhrsw m5, m7
30159 packuswb m3, m5
30160 movu [r0 + 1770 * 16], m3
30161
30162 movu m1, [r3 + 23]
30163 movd m3, [r3 + 24]
30164 palignr m3, m1, 1
30165 punpcklbw m1, m3
30166 pmaddubsw m3, m1, m6
30167 pmulhrsw m3, m7
30168 movu m4, [r3 + 31]
30169 movd m5, [r3 + 32]
30170 palignr m5, m4, 1
30171 punpcklbw m4, m5
30172 pmaddubsw m5, m4, m6
30173 pmulhrsw m5, m7
30174 packuswb m3, m5
30175 movu [r0 + 1771 * 16], m3
30176
30177 ; mode 29 [row 22]
30178 movu m6, [r5 + 15 * 16]
30179 pmaddubsw m3, m0, m6
30180 pmulhrsw m3, m7
30181 pmaddubsw m5, m2, m6
30182 pmulhrsw m5, m7
30183 packuswb m3, m5
30184 movu [r0 + 1772 * 16], m3
30185 pmaddubsw m3, m1, m6
30186 pmulhrsw m3, m7
30187 pmaddubsw m5, m4, m6
30188 pmulhrsw m5, m7
30189 packuswb m3, m5
30190 movu [r0 + 1773 * 16], m3
30191
30192 ; mode 29 [row 23]
30193 movu m6, [r5 + 24 * 16]
30194 pmaddubsw m3, m0, m6
30195 pmulhrsw m3, m7
30196 pmaddubsw m5, m2, m6
30197 pmulhrsw m5, m7
30198 packuswb m3, m5
30199 movu [r0 + 1774 * 16], m3
30200 pmaddubsw m3, m1, m6
30201 pmulhrsw m3, m7
30202 pmaddubsw m5, m4, m6
30203 pmulhrsw m5, m7
30204 packuswb m3, m5
30205 movu [r0 + 1775 * 16], m3
30206
30207 ; mode 30 [row 14]
30208 movu m6, [r5 + 3 * 16]
30209 pmaddubsw m3, m0, m6
30210 pmulhrsw m3, m7
30211 pmaddubsw m5, m2, m6
30212 pmulhrsw m5, m7
30213 packuswb m3, m5
30214 movu [r0 + 1820 * 16], m3
30215 pmaddubsw m3, m1, m6
30216 pmulhrsw m3, m7
30217 pmaddubsw m5, m4, m6
30218 pmulhrsw m5, m7
30219 packuswb m3, m5
30220 movu [r0 + 1821 * 16], m3
30221
30222 ; mode 30 [row 15]
30223 movu m6, [r5 + 16 * 16]
30224 pmaddubsw m3, m0, m6
30225 pmulhrsw m3, m7
30226 pmaddubsw m5, m2, m6
30227 pmulhrsw m5, m7
30228 packuswb m3, m5
30229 movu [r0 + 1822 * 16], m3
30230
30231 ; mode 33 [row 7 - first half]
30232 movu [r0 + 1998 * 16], m3
30233
30234 pmaddubsw m3, m1, m6
30235 pmulhrsw m3, m7
30236 pmaddubsw m5, m4, m6
30237 pmulhrsw m5, m7
30238 packuswb m3, m5
30239 movu [r0 + 1823 * 16], m3
30240
30241 ; mode 33 [row 7 - second half]
30242 movu [r0 + 1999 * 16], m3
30243
30244 ; mode 30 [row 16]
30245 movu m6, [r5 + 29 * 16]
30246 pmaddubsw m3, m0, m6
30247 pmulhrsw m3, m7
30248 pmaddubsw m5, m2, m6
30249 pmulhrsw m5, m7
30250 packuswb m3, m5
30251 movu [r0 + 1824 * 16], m3
30252
30253 ; mode 31 [row 12 - first half]
30254 movu [r0 + 1880 * 16], m3
30255
30256 pmaddubsw m3, m1, m6
30257 pmulhrsw m3, m7
30258 pmaddubsw m5, m4, m6
30259 pmulhrsw m5, m7
30260 packuswb m3, m5
30261 movu [r0 + 1825 * 16], m3
30262
30263 ; mode 31 [row 12 - second half]
30264 movu [r0 + 1881 * 16], m3
30265
30266 ; mode 31 [row 11]
30267 movu m6, [r5 + 12 * 16]
30268 pmaddubsw m3, m0, m6
30269 pmulhrsw m3, m7
30270 pmaddubsw m5, m2, m6
30271 pmulhrsw m5, m7
30272 packuswb m3, m5
30273 movu [r0 + 1878 * 16], m3
30274 pmaddubsw m3, m1, m6
30275 pmulhrsw m3, m7
30276 pmaddubsw m5, m4, m6
30277 pmulhrsw m5, m7
30278 packuswb m3, m5
30279 movu [r0 + 1879 * 16], m3
30280
30281 ; mode 32 [row 9]
30282 movu m6, [r5 + 18 * 16]
30283 pmaddubsw m3, m0, m6
30284 pmulhrsw m3, m7
30285 pmaddubsw m5, m2, m6
30286 pmulhrsw m5, m7
30287 packuswb m3, m5
30288 movu [r0 + 1938 * 16], m3
30289 pmaddubsw m3, m1, m6
30290 pmulhrsw m3, m7
30291 pmaddubsw m5, m4, m6
30292 pmulhrsw m5, m7
30293 packuswb m3, m5
30294 movu [r0 + 1939 * 16], m3
30295
30296 ; mode 29 [row 24]
30297 movu m6, [r5 + 1 * 16]
30298 movu m0, [r3 + 8]
30299 movd m1, [r3 + 9]
30300 palignr m1, m0, 1
30301 punpcklbw m0, m1
30302 pmaddubsw m3, m0, m6
30303 pmulhrsw m3, m7
30304 movu m2, [r3 + 16]
30305 movd m4, [r3 + 17]
30306 palignr m4, m2, 1
30307 punpcklbw m2, m4
30308 pmaddubsw m5, m2, m6
30309 pmulhrsw m5, m7
30310 packuswb m3, m5
30311 movu [r0 + 1776 * 16], m3
30312
30313 movu m1, [r3 + 24]
30314 movd m3, [r3 + 25]
30315 palignr m3, m1, 1
30316 punpcklbw m1, m3
30317 pmaddubsw m3, m1, m6
30318 pmulhrsw m3, m7
30319 movu m4, [r3 + 32]
30320 movd m5, [r3 + 33]
30321 palignr m5, m4, 1
30322 punpcklbw m4, m5
30323 pmaddubsw m5, m4, m6
30324 pmulhrsw m5, m7
30325 packuswb m3, m5
30326 movu [r0 + 1777 * 16], m3
30327
30328 ; mode 29 [row 25]
30329 movu m6, [r5 + 10 * 16]
30330 pmaddubsw m3, m0, m6
30331 pmulhrsw m3, m7
30332 pmaddubsw m5, m2, m6
30333 pmulhrsw m5, m7
30334 packuswb m3, m5
30335 movu [r0 + 1778 * 16], m3
30336
30337 ; mode 30 [row 17 - first half]
30338 movu [r0 + 1826 * 16], m3
30339
30340 ; mode 33 [row 8 - first half]
30341 movu [r0 + 2000 * 16], m3
30342
30343 pmaddubsw m3, m1, m6
30344 pmulhrsw m3, m7
30345 pmaddubsw m5, m4, m6
30346 pmulhrsw m5, m7
30347 packuswb m3, m5
30348 movu [r0 + 1779 * 16], m3
30349
30350 ; mode 30 [row 17 - second half]
30351 movu [r0 + 1827 * 16], m3
30352
30353 ; mode 33 [row 8 - second half]
30354 movu [r0 + 2001 * 16], m3
30355
30356 ; mode 29 [row 26]
30357 movu m6, [r5 + 19 * 16]
30358 pmaddubsw m3, m0, m6
30359 pmulhrsw m3, m7
30360 pmaddubsw m5, m2, m6
30361 pmulhrsw m5, m7
30362 packuswb m3, m5
30363 movu [r0 + 1780 * 16], m3
30364 pmaddubsw m3, m1, m6
30365 pmulhrsw m3, m7
30366 pmaddubsw m5, m4, m6
30367 pmulhrsw m5, m7
30368 packuswb m3, m5
30369 movu [r0 + 1781 * 16], m3
30370
30371 ; mode 29 [row 27]
30372 movu m6, [r5 + 28 * 16]
30373 pmaddubsw m3, m0, m6
30374 pmulhrsw m3, m7
30375 pmaddubsw m5, m2, m6
30376 pmulhrsw m5, m7
30377 packuswb m3, m5
30378 movu [r0 + 1782 * 16], m3
30379
30380 ; mode 32 [row 11 - first half]
30381 movu [r0 + 1942 * 16], m3
30382
30383 pmaddubsw m3, m1, m6
30384 pmulhrsw m3, m7
30385 pmaddubsw m5, m4, m6
30386 pmulhrsw m5, m7
30387 packuswb m3, m5
30388 movu [r0 + 1783 * 16], m3
30389
30390 ; mode 32 [row 11 - second half]
30391 movu [r0 + 1943 * 16], m3
30392
30393 ; mode 30 [row 18]
30394 movu m6, [r5 + 23 * 16]
30395 pmaddubsw m3, m0, m6
30396 pmulhrsw m3, m7
30397 pmaddubsw m5, m2, m6
30398 pmulhrsw m5, m7
30399 packuswb m3, m5
30400 movu [r0 + 1828 * 16], m3
30401 pmaddubsw m3, m1, m6
30402 pmulhrsw m3, m7
30403 pmaddubsw m5, m4, m6
30404 pmulhrsw m5, m7
30405 packuswb m3, m5
30406 movu [r0 + 1829 * 16], m3
30407
30408 ; mode 31 [row 13]
30409 movu m6, [r5 + 14 * 16]
30410 pmaddubsw m3, m0, m6
30411 pmulhrsw m3, m7
30412 pmaddubsw m5, m2, m6
30413 pmulhrsw m5, m7
30414 packuswb m3, m5
30415 movu [r0 + 1882 * 16], m3
30416 pmaddubsw m3, m1, m6
30417 pmulhrsw m3, m7
30418 pmaddubsw m5, m4, m6
30419 pmulhrsw m5, m7
30420 packuswb m3, m5
30421 movu [r0 + 1883 * 16], m3
30422
30423 ; mode 31 [row 14]
30424 movu m6, [r5 + 31 * 16]
30425 pmaddubsw m3, m0, m6
30426 pmulhrsw m3, m7
30427 pmaddubsw m5, m2, m6
30428 pmulhrsw m5, m7
30429 packuswb m3, m5
30430 movu [r0 + 1884 * 16], m3
30431 pmaddubsw m3, m1, m6
30432 pmulhrsw m3, m7
30433 pmaddubsw m5, m4, m6
30434 pmulhrsw m5, m7
30435 packuswb m3, m5
30436 movu [r0 + 1885 * 16], m3
30437
30438 ; mode 32 [row 10]
30439 movu m6, [r5 + 7 * 16]
30440 pmaddubsw m3, m0, m6
30441 pmulhrsw m3, m7
30442 pmaddubsw m5, m2, m6
30443 pmulhrsw m5, m7
30444 packuswb m3, m5
30445 movu [r0 + 1940 * 16], m3
30446 pmaddubsw m3, m1, m6
30447 pmulhrsw m3, m7
30448 pmaddubsw m5, m4, m6
30449 pmulhrsw m5, m7
30450 packuswb m3, m5
30451 movu [r0 + 1941 * 16], m3
30452
30453 ; mode 29 [row 28]
30454 movu m6, [r5 + 5 * 16]
30455 movu m0, [r3 + 9]
30456 movd m1, [r3 + 10]
30457 palignr m1, m0, 1
30458 punpcklbw m0, m1
30459 pmaddubsw m3, m0, m6
30460 pmulhrsw m3, m7
30461 movu m2, [r3 + 17]
30462 movd m4, [r3 + 18]
30463 palignr m4, m2, 1
30464 punpcklbw m2, m4
30465 pmaddubsw m5, m2, m6
30466 pmulhrsw m5, m7
30467 packuswb m3, m5
30468 movu [r0 + 1784 * 16], m3
30469
30470 movu m1, [r3 + 25]
30471 movd m3, [r3 + 26]
30472 palignr m3, m1, 1
30473 punpcklbw m1, m3
30474 pmaddubsw m3, m1, m6
30475 pmulhrsw m3, m7
30476 movu m4, [r3 + 33]
30477 movd m5, [r3 + 34]
30478 palignr m5, m4, 1
30479 punpcklbw m4, m5
30480 pmaddubsw m5, m4, m6
30481 pmulhrsw m5, m7
30482 packuswb m3, m5
30483 movu [r0 + 1785 * 16], m3
30484
30485 ; mode 29 [row 29]
30486 movu m6, [r5 + 14 * 16]
30487 pmaddubsw m3, m0, m6
30488 pmulhrsw m3, m7
30489 pmaddubsw m5, m2, m6
30490 pmulhrsw m5, m7
30491 packuswb m3, m5
30492 movu [r0 + 1786 * 16], m3
30493 pmaddubsw m3, m1, m6
30494 pmulhrsw m3, m7
30495 pmaddubsw m5, m4, m6
30496 pmulhrsw m5, m7
30497 packuswb m3, m5
30498 movu [r0 + 1787 * 16], m3
30499
30500 ; mode 29 [row 30]
30501 movu m6, [r5 + 23 * 16]
30502 pmaddubsw m3, m0, m6
30503 pmulhrsw m3, m7
30504 pmaddubsw m5, m2, m6
30505 pmulhrsw m5, m7
30506 packuswb m3, m5
30507 movu [r0 + 1788 * 16], m3
30508 pmaddubsw m3, m1, m6
30509 pmulhrsw m3, m7
30510 pmaddubsw m5, m4, m6
30511 pmulhrsw m5, m7
30512 packuswb m3, m5
30513 movu [r0 + 1789 * 16], m3
30514
30515 ; mode 30 [row 19]
30516 movu m6, [r5 + 4 * 16]
30517 pmaddubsw m3, m0, m6
30518 pmulhrsw m3, m7
30519 pmaddubsw m5, m2, m6
30520 pmulhrsw m5, m7
30521 packuswb m3, m5
30522 movu [r0 + 1830 * 16], m3
30523
30524 ; mode 33 [row 9 - first half]
30525 movu [r0 + 2002 * 16], m3
30526
30527 pmaddubsw m3, m1, m6
30528 pmulhrsw m3, m7
30529 pmaddubsw m5, m4, m6
30530 pmulhrsw m5, m7
30531 packuswb m3, m5
30532 movu [r0 + 1831 * 16], m3
30533
30534 ; mode 33 [row 9 - second half]
30535 movu [r0 + 2003 * 16], m3
30536
30537 ; mode 30 [row 20]
30538 movu m6, [r5 + 17 * 16]
30539 pmaddubsw m3, m0, m6
30540 pmulhrsw m3, m7
30541 pmaddubsw m5, m2, m6
30542 pmulhrsw m5, m7
30543 packuswb m3, m5
30544 movu [r0 + 1832 * 16], m3
30545
30546 ; mode 32 [row 12 - first half]
30547 movu [r0 + 1944 * 16], m3
30548
30549 pmaddubsw m3, m1, m6
30550 pmulhrsw m3, m7
30551 pmaddubsw m5, m4, m6
30552 pmulhrsw m5, m7
30553 packuswb m3, m5
30554 movu [r0 + 1833 * 16], m3
30555
30556 ; mode 32 [row 12 - second half]
30557 movu [r0 + 1945 * 16], m3
30558
30559 ; mode 30 [row 21]
30560 movu m6, [r5 + 30 * 16]
30561 pmaddubsw m3, m0, m6
30562 pmulhrsw m3, m7
30563 pmaddubsw m5, m2, m6
30564 pmulhrsw m5, m7
30565 packuswb m3, m5
30566 movu [r0 + 1834 * 16], m3
30567
30568 ; mode 33 [row 10 - first half]
30569 movu [r0 + 2004 * 16], m3
30570
30571 pmaddubsw m3, m1, m6
30572 pmulhrsw m3, m7
30573 pmaddubsw m5, m4, m6
30574 pmulhrsw m5, m7
30575 packuswb m3, m5
30576 movu [r0 + 1835 * 16], m3
30577
30578 ; mode 33 [row 10 - second half]
30579 movu [r0 + 2005 * 16], m3
30580
30581 ; mode 31 [row 15]
30582 movu m6, [r5 + 16 * 16]
30583 pmaddubsw m3, m0, m6
30584 pmulhrsw m3, m7
30585 pmaddubsw m5, m2, m6
30586 pmulhrsw m5, m7
30587 packuswb m3, m5
30588 movu [r0 + 1886 * 16], m3
30589 pmaddubsw m3, m1, m6
30590 pmulhrsw m3, m7
30591 pmaddubsw m5, m4, m6
30592 pmulhrsw m5, m7
30593 packuswb m3, m5
30594 movu [r0 + 1887 * 16], m3
30595
30596 ; mode 29 [row 31]
30597 movu m0, [r3 + 10]
30598 movd m1, [r3 + 11]
30599 palignr m1, m0, 1
30600 punpcklbw m0, m1
30601 movu m2, [r3 + 18]
30602 movd m3, [r3 + 19]
30603 palignr m3, m2, 1
30604 punpcklbw m2, m3
30605 movu m1, [r3 + 26]
30606 movd m3, [r3 + 27]
30607 palignr m3, m1, 1
30608 punpcklbw m1, m3
30609 movu m4, [r3 + 34]
30610 movd m5, [r3 + 35]
30611 palignr m5, m4, 1
30612 punpcklbw m4, m5
30613
30614 pshufb m5, m0, [tab_S2]
30615 movh [r0 + 1790 * 16], m5
30616 pshufb m5, m2, [tab_S2]
30617 movh [r0 + 1790 * 16 + 8], m5
30618 pshufb m5, m1, [tab_S2]
30619 movh [r0 + 1791 * 16], m5
30620 pshufb m5, m4, [tab_S2]
30621 movh [r0 + 1791 * 16 + 8], m5
30622
30623 ; mode 30 [row 22]
30624 movu m6, [r5 + 11 * 16]
30625 pmaddubsw m3, m0, m6
30626 pmulhrsw m3, m7
30627 pmaddubsw m5, m2, m6
30628 pmulhrsw m5, m7
30629 packuswb m3, m5
30630 movu [r0 + 1836 * 16], m3
30631 pmaddubsw m3, m1, m6
30632 pmulhrsw m3, m7
30633 pmaddubsw m5, m4, m6
30634 pmulhrsw m5, m7
30635 packuswb m3, m5
30636 movu [r0 + 1837 * 16], m3
30637
30638 ; mode 30 [row 23]
30639 movu m6, [r5 + 24 * 16]
30640 pmaddubsw m3, m0, m6
30641 pmulhrsw m3, m7
30642 pmaddubsw m5, m2, m6
30643 pmulhrsw m5, m7
30644 packuswb m3, m5
30645 movu [r0 + 1838 * 16], m3
30646
30647 ; mode 33 [row 11 - first half]
30648 movu [r0 + 2006 * 16], m3
30649
30650 pmaddubsw m3, m1, m6
30651 pmulhrsw m3, m7
30652 pmaddubsw m5, m4, m6
30653 pmulhrsw m5, m7
30654 packuswb m3, m5
30655 movu [r0 + 1839 * 16], m3
30656
30657 ; mode 33 [row 11 - second half]
30658 movu [r0 + 2007 * 16], m3
30659
30660 ; mode 31 [row 16]
30661 movu m6, [r5 + 1 * 16]
30662 pmaddubsw m3, m0, m6
30663 pmulhrsw m3, m7
30664 pmaddubsw m5, m2, m6
30665 pmulhrsw m5, m7
30666 packuswb m3, m5
30667 movu [r0 + 1888 * 16], m3
30668 pmaddubsw m3, m1, m6
30669 pmulhrsw m3, m7
30670 pmaddubsw m5, m4, m6
30671 pmulhrsw m5, m7
30672 packuswb m3, m5
30673 movu [r0 + 1889 * 16], m3
30674
30675 ; mode 31 [row 17]
30676 movu m6, [r5 + 18 * 16]
30677 pmaddubsw m3, m0, m6
30678 pmulhrsw m3, m7
30679 pmaddubsw m5, m2, m6
30680 pmulhrsw m5, m7
30681 packuswb m3, m5
30682 movu [r0 + 1890 * 16], m3
30683 pmaddubsw m3, m1, m6
30684 pmulhrsw m3, m7
30685 pmaddubsw m5, m4, m6
30686 pmulhrsw m5, m7
30687 packuswb m3, m5
30688 movu [r0 + 1891 * 16], m3
30689
30690 ; mode 32 [row 13]
30691 movu m6, [r5 + 6 * 16]
30692 pmaddubsw m3, m0, m6
30693 pmulhrsw m3, m7
30694 pmaddubsw m5, m2, m6
30695 pmulhrsw m5, m7
30696 packuswb m3, m5
30697 movu [r0 + 1946 * 16], m3
30698 pmaddubsw m3, m1, m6
30699 pmulhrsw m3, m7
30700 pmaddubsw m5, m4, m6
30701 pmulhrsw m5, m7
30702 packuswb m3, m5
30703 movu [r0 + 1947 * 16], m3
30704
30705 ; mode 32 [row 14]
30706 movu m6, [r5 + 27 * 16]
30707 pmaddubsw m3, m0, m6
30708 pmulhrsw m3, m7
30709 pmaddubsw m5, m2, m6
30710 pmulhrsw m5, m7
30711 packuswb m3, m5
30712 movu [r0 + 1948 * 16], m3
30713 pmaddubsw m3, m1, m6
30714 pmulhrsw m3, m7
30715 pmaddubsw m5, m4, m6
30716 pmulhrsw m5, m7
30717 packuswb m3, m5
30718 movu [r0 + 1949 * 16], m3
30719
30720 ; mode 30 [row 24]
30721 movu m6, [r5 + 5 * 16]
30722 movu m0, [r3 + 11]
30723 movd m1, [r3 + 12]
30724 palignr m1, m0, 1
30725 punpcklbw m0, m1
30726 pmaddubsw m3, m0, m6
30727 pmulhrsw m3, m7
30728 movu m2, [r3 + 19]
30729 movd m4, [r3 + 20]
30730 palignr m4, m2, 1
30731 punpcklbw m2, m4
30732 pmaddubsw m5, m2, m6
30733 pmulhrsw m5, m7
30734 packuswb m3, m5
30735 movu [r0 + 1840 * 16], m3
30736
30737 movu m1, [r3 + 27]
30738 movd m3, [r3 + 28]
30739 palignr m3, m1, 1
30740 punpcklbw m1, m3
30741 pmaddubsw m3, m1, m6
30742 pmulhrsw m3, m7
30743 movu m4, [r3 + 35]
30744 movd m5, [r3 + 36]
30745 palignr m5, m4, 1
30746 punpcklbw m4, m5
30747 pmaddubsw m5, m4, m6
30748 pmulhrsw m5, m7
30749 packuswb m3, m5
30750 movu [r0 + 1841 * 16], m3
30751
30752 ; mode 30 [row 25]
30753 movu m6, [r5 + 18 * 16]
30754 pmaddubsw m3, m0, m6
30755 pmulhrsw m3, m7
30756 pmaddubsw m5, m2, m6
30757 pmulhrsw m5, m7
30758 packuswb m3, m5
30759 movu [r0 + 1842 * 16], m3
30760
30761 ; mode 33 [row 12 - first half]
30762 movu [r0 + 2008 * 16], m3
30763
30764 pmaddubsw m3, m1, m6
30765 pmulhrsw m3, m7
30766 pmaddubsw m5, m4, m6
30767 pmulhrsw m5, m7
30768 packuswb m3, m5
30769 movu [r0 + 1843 * 16], m3
30770
30771 ; mode 33 [row 12 - second half]
30772 movu [r0 + 2009 * 16], m3
30773
30774 ; mode 30 [row 26]
30775 movu m6, [r5 + 31 * 16]
30776 pmaddubsw m3, m0, m6
30777 pmulhrsw m3, m7
30778 pmaddubsw m5, m2, m6
30779 pmulhrsw m5, m7
30780 packuswb m3, m5
30781 movu [r0 + 1844 * 16], m3
30782 pmaddubsw m3, m1, m6
30783 pmulhrsw m3, m7
30784 pmaddubsw m5, m4, m6
30785 pmulhrsw m5, m7
30786 packuswb m3, m5
30787 movu [r0 + 1845 * 16], m3
30788
30789 ; mode 31 [row 18]
30790 movu m6, [r5 + 3 * 16]
30791 pmaddubsw m3, m0, m6
30792 pmulhrsw m3, m7
30793 pmaddubsw m5, m2, m6
30794 pmulhrsw m5, m7
30795 packuswb m3, m5
30796 movu [r0 + 1892 * 16], m3
30797 pmaddubsw m3, m1, m6
30798 pmulhrsw m3, m7
30799 pmaddubsw m5, m4, m6
30800 pmulhrsw m5, m7
30801 packuswb m3, m5
30802 movu [r0 + 1893 * 16], m3
30803
30804 ; mode 31 [row 19]
30805 movu m6, [r5 + 20 * 16]
30806 pmaddubsw m3, m0, m6
30807 pmulhrsw m3, m7
30808 pmaddubsw m5, m2, m6
30809 pmulhrsw m5, m7
30810 packuswb m3, m5
30811 movu [r0 + 1894 * 16], m3
30812 pmaddubsw m3, m1, m6
30813 pmulhrsw m3, m7
30814 pmaddubsw m5, m4, m6
30815 pmulhrsw m5, m7
30816 packuswb m3, m5
30817 movu [r0 + 1895 * 16], m3
30818
30819 ; mode 32 [row 15]
30820 movu m6, [r5 + 16 * 16]
30821 pmaddubsw m3, m0, m6
30822 pmulhrsw m3, m7
30823 pmaddubsw m5, m2, m6
30824 pmulhrsw m5, m7
30825 packuswb m3, m5
30826 movu [r0 + 1950 * 16], m3
30827 pmaddubsw m3, m1, m6
30828 pmulhrsw m3, m7
30829 pmaddubsw m5, m4, m6
30830 pmulhrsw m5, m7
30831 packuswb m3, m5
30832 movu [r0 + 1951 * 16], m3
30833
30834 ; mode 30 [row 27]
30835 movu m6, [r5 + 12 * 16]
30836 movu m0, [r3 + 12]
30837 movd m1, [r3 + 13]
30838 palignr m1, m0, 1
30839 punpcklbw m0, m1
30840 pmaddubsw m3, m0, m6
30841 pmulhrsw m3, m7
30842 movu m2, [r3 + 20]
30843 movd m4, [r3 + 21]
30844 palignr m4, m2, 1
30845 punpcklbw m2, m4
30846 pmaddubsw m5, m2, m6
30847 pmulhrsw m5, m7
30848 packuswb m3, m5
30849 movu [r0 + 1846 * 16], m3
30850
30851 ; mode 33 [row 13 - first half]
30852 movu [r0 + 2010 * 16], m3
30853
30854 movu m1, [r3 + 28]
30855 movd m3, [r3 + 29]
30856 palignr m3, m1, 1
30857 punpcklbw m1, m3
30858 pmaddubsw m3, m1, m6
30859 pmulhrsw m3, m7
30860 movu m4, [r3 + 36]
30861 movd m5, [r3 + 37]
30862 palignr m5, m4, 1
30863 punpcklbw m4, m5
30864 pmaddubsw m5, m4, m6
30865 pmulhrsw m5, m7
30866 packuswb m3, m5
30867 movu [r0 + 1847 * 16], m3
30868
30869 ; mode 33 [row 13 - second half]
30870 movu [r0 + 2011 * 16], m3
30871
30872 ; mode 30 [row 28]
30873 movu m6, [r5 + 25 * 16]
30874 pmaddubsw m3, m0, m6
30875 pmulhrsw m3, m7
30876 pmaddubsw m5, m2, m6
30877 pmulhrsw m5, m7
30878 packuswb m3, m5
30879 movu [r0 + 1848 * 16], m3
30880 pmaddubsw m3, m1, m6
30881 pmulhrsw m3, m7
30882 pmaddubsw m5, m4, m6
30883 pmulhrsw m5, m7
30884 packuswb m3, m5
30885 movu [r0 + 1849 * 16], m3
30886
30887 ; mode 31 [row 20]
30888 movu m6, [r5 + 5 * 16]
30889 pmaddubsw m3, m0, m6
30890 pmulhrsw m3, m7
30891 pmaddubsw m5, m2, m6
30892 pmulhrsw m5, m7
30893 packuswb m3, m5
30894 movu [r0 + 1896 * 16], m3
30895
30896 ; mode 32 [row 16 - first half]
30897 movu [r0 + 1952 * 16], m3
30898
30899 pmaddubsw m3, m1, m6
30900 pmulhrsw m3, m7
30901 pmaddubsw m5, m4, m6
30902 pmulhrsw m5, m7
30903 packuswb m3, m5
30904 movu [r0 + 1897 * 16], m3
30905
30906 ; mode 32 [row 16 - second half]
30907 movu [r0 + 1953 * 16], m3
30908
30909 ; mode 31 [row 21]
30910 movu m6, [r5 + 22 * 16]
30911 pmaddubsw m3, m0, m6
30912 pmulhrsw m3, m7
30913 pmaddubsw m5, m2, m6
30914 pmulhrsw m5, m7
30915 packuswb m3, m5
30916 movu [r0 + 1898 * 16], m3
30917 pmaddubsw m3, m1, m6
30918 pmulhrsw m3, m7
30919 pmaddubsw m5, m4, m6
30920 pmulhrsw m5, m7
30921 packuswb m3, m5
30922 movu [r0 + 1899 * 16], m3
30923
30924 ; mode 32 [row 17]
30925 movu m6, [r5 + 26 * 16]
30926 pmaddubsw m3, m0, m6
30927 pmulhrsw m3, m7
30928 pmaddubsw m5, m2, m6
30929 pmulhrsw m5, m7
30930 packuswb m3, m5
30931 movu [r0 + 1954 * 16], m3
30932 pmaddubsw m3, m1, m6
30933 pmulhrsw m3, m7
30934 pmaddubsw m5, m4, m6
30935 pmulhrsw m5, m7
30936 packuswb m3, m5
30937 movu [r0 + 1955 * 16], m3
30938
30939 ; mode 30 [row 29]
30940 movu m6, [r5 + 6 * 16]
30941 movu m0, [r3 + 13]
30942 movd m1, [r3 + 14]
30943 palignr m1, m0, 1
30944 punpcklbw m0, m1
30945 pmaddubsw m3, m0, m6
30946 pmulhrsw m3, m7
30947 movu m2, [r3 + 21]
30948 movd m4, [r3 + 22]
30949 palignr m4, m2, 1
30950 punpcklbw m2, m4
30951 pmaddubsw m5, m2, m6
30952 pmulhrsw m5, m7
30953 packuswb m3, m5
30954 movu [r0 + 1850 * 16], m3
30955
30956 ; mode 33 [row 14 - first half]
30957 movu [r0 + 2012 * 16], m3
30958
30959 movu m1, [r3 + 29]
30960 movd m3, [r3 + 30]
30961 palignr m3, m1, 1
30962 punpcklbw m1, m3
30963 pmaddubsw m3, m1, m6
30964 pmulhrsw m3, m7
30965 movu m4, [r3 + 37]
30966 movd m5, [r3 + 38]
30967 palignr m5, m4, 1
30968 punpcklbw m4, m5
30969 pmaddubsw m5, m4, m6
30970 pmulhrsw m5, m7
30971 packuswb m3, m5
30972 movu [r0 + 1851 * 16], m3
30973
30974 ; mode 33 [row 14 - second half]
30975 movu [r0 + 2013 * 16], m3
30976
30977 ; mode 30 [row 30]
30978 movu m6, [r5 + 19 * 16]
30979 pmaddubsw m3, m0, m6
30980 pmulhrsw m3, m7
30981 pmaddubsw m5, m2, m6
30982 pmulhrsw m5, m7
30983 packuswb m3, m5
30984 movu [r0 + 1852 * 16], m3
30985 pmaddubsw m3, m1, m6
30986 pmulhrsw m3, m7
30987 pmaddubsw m5, m4, m6
30988 pmulhrsw m5, m7
30989 packuswb m3, m5
30990 movu [r0 + 1853 * 16], m3
30991
30992 ; mode 31 [row 22]
30993 movu m6, [r5 + 7 * 16]
30994 pmaddubsw m3, m0, m6
30995 pmulhrsw m3, m7
30996 pmaddubsw m5, m2, m6
30997 pmulhrsw m5, m7
30998 packuswb m3, m5
30999 movu [r0 + 1900 * 16], m3
31000 pmaddubsw m3, m1, m6
31001 pmulhrsw m3, m7
31002 pmaddubsw m5, m4, m6
31003 pmulhrsw m5, m7
31004 packuswb m3, m5
31005 movu [r0 + 1901 * 16], m3
31006
31007 ; mode 31 [row 23]
31008 movu m6, [r5 + 24 * 16]
31009 pmaddubsw m3, m0, m6
31010 pmulhrsw m3, m7
31011 pmaddubsw m5, m2, m6
31012 pmulhrsw m5, m7
31013 packuswb m3, m5
31014 movu [r0 + 1902 * 16], m3
31015 pmaddubsw m3, m1, m6
31016 pmulhrsw m3, m7
31017 pmaddubsw m5, m4, m6
31018 pmulhrsw m5, m7
31019 packuswb m3, m5
31020 movu [r0 + 1903 * 16], m3
31021
31022 ; mode 32 [row 18]
31023 movu m6, [r5 + 15 * 16]
31024 pmaddubsw m3, m0, m6
31025 pmulhrsw m3, m7
31026 pmaddubsw m5, m2, m6
31027 pmulhrsw m5, m7
31028 packuswb m3, m5
31029 movu [r0 + 1956 * 16], m3
31030 pmaddubsw m3, m1, m6
31031 pmulhrsw m3, m7
31032 pmaddubsw m5, m4, m6
31033 pmulhrsw m5, m7
31034 packuswb m3, m5
31035 movu [r0 + 1957 * 16], m3
31036
31037 ; mode 30 [row 31]
31038 movu m0, [r3 + 14]
31039 movd m1, [r3 + 15]
31040 palignr m1, m0, 1
31041 punpcklbw m0, m1
31042 movu m2, [r3 + 22]
31043 movd m3, [r3 + 23]
31044 palignr m3, m2, 1
31045 punpcklbw m2, m3
31046 movu m1, [r3 + 30]
31047 movd m3, [r3 + 31]
31048 palignr m3, m1, 1
31049 punpcklbw m1, m3
31050 movu m4, [r3 + 38]
31051 movd m5, [r3 + 39]
31052 palignr m5, m4, 1
31053 punpcklbw m4, m5
31054
31055 pshufb m5, m0, [tab_S2]
31056 movh [r0 + 1854 * 16], m5
31057
31058 ; mode 33 [row 15 - first eight]
31059 movh [r0 + 2014 * 16], m5
31060
31061 pshufb m5, m2, [tab_S2]
31062 movh [r0 + 1854 * 16 + 8], m5
31063
31064 ; mode 33 [row 15 - second eight]
31065 movh [r0 + 2014 * 16 + 8], m5
31066
31067 pshufb m5, m1, [tab_S2]
31068 movh [r0 + 1855 * 16], m5
31069
31070 ; mode 33 [row 15 - third eight]
31071 movh [r0 + 2015 * 16], m5
31072
31073 pshufb m5, m4, [tab_S2]
31074 movh [r0 + 1855 * 16 + 8], m5
31075
31076 ; mode 33 [row 15 - fourth eight]
31077 movh [r0 + 2015 * 16 + 8], m5
31078
31079 ; mode 31 [row 24]
31080 movu m6, [r5 + 9 * 16]
31081 pmaddubsw m3, m0, m6
31082 pmulhrsw m3, m7
31083 pmaddubsw m5, m2, m6
31084 pmulhrsw m5, m7
31085 packuswb m3, m5
31086 movu [r0 + 1904 * 16], m3
31087 pmaddubsw m3, m1, m6
31088 pmulhrsw m3, m7
31089 pmaddubsw m5, m4, m6
31090 pmulhrsw m5, m7
31091 packuswb m3, m5
31092 movu [r0 + 1905 * 16], m3
31093
31094 ; mode 31 [row 25]
31095 movu m6, [r5 + 26 * 16]
31096 pmaddubsw m3, m0, m6
31097 pmulhrsw m3, m7
31098 pmaddubsw m5, m2, m6
31099 pmulhrsw m5, m7
31100 packuswb m3, m5
31101 movu [r0 + 1906 * 16], m3
31102
31103 ; mode 33 [row 16 - first half]
31104 movu [r0 + 2016 * 16], m3
31105
31106 pmaddubsw m3, m1, m6
31107 pmulhrsw m3, m7
31108 pmaddubsw m5, m4, m6
31109 pmulhrsw m5, m7
31110 packuswb m3, m5
31111 movu [r0 + 1907 * 16], m3
31112
31113 ; mode 33 [row 16 - second half]
31114 movu [r0 + 2017 * 16], m3
31115
31116 ; mode 32 [row 19]
31117 movu m6, [r5 + 4 * 16]
31118 pmaddubsw m3, m0, m6
31119 pmulhrsw m3, m7
31120 pmaddubsw m5, m2, m6
31121 pmulhrsw m5, m7
31122 packuswb m3, m5
31123 movu [r0 + 1958 * 16], m3
31124 pmaddubsw m3, m1, m6
31125 pmulhrsw m3, m7
31126 pmaddubsw m5, m4, m6
31127 pmulhrsw m5, m7
31128 packuswb m3, m5
31129 movu [r0 + 1959 * 16], m3
31130
31131 ; mode 32 [row 20]
31132 movu m6, [r5 + 25 * 16]
31133 pmaddubsw m3, m0, m6
31134 pmulhrsw m3, m7
31135 pmaddubsw m5, m2, m6
31136 pmulhrsw m5, m7
31137 packuswb m3, m5
31138 movu [r0 + 1960 * 16], m3
31139 pmaddubsw m3, m1, m6
31140 pmulhrsw m3, m7
31141 pmaddubsw m5, m4, m6
31142 pmulhrsw m5, m7
31143 packuswb m3, m5
31144 movu [r0 + 1961 * 16], m3
31145
31146 ; mode 31 [row 26]
31147 movu m6, [r5 + 11 * 16]
31148 movu m0, [r3 + 15]
31149 movd m1, [r3 + 16]
31150 palignr m1, m0, 1
31151 punpcklbw m0, m1
31152 pmaddubsw m3, m0, m6
31153 pmulhrsw m3, m7
31154 movu m2, [r3 + 23]
31155 movd m4, [r3 + 24]
31156 palignr m4, m2, 1
31157 punpcklbw m2, m4
31158 pmaddubsw m5, m2, m6
31159 pmulhrsw m5, m7
31160 packuswb m3, m5
31161 movu [r0 + 1908 * 16], m3
31162
31163 movu m1, [r3 + 31]
31164 movd m3, [r3 + 32]
31165 palignr m3, m1, 1
31166 punpcklbw m1, m3
31167 pmaddubsw m3, m1, m6
31168 pmulhrsw m3, m7
31169 movu m4, [r3 + 39]
31170 movd m5, [r3 + 40]
31171 palignr m5, m4, 1
31172 punpcklbw m4, m5
31173 pmaddubsw m5, m4, m6
31174 pmulhrsw m5, m7
31175 packuswb m3, m5
31176 movu [r0 + 1909 * 16], m3
31177
31178 ; mode 31 [row 27]
31179 movu m6, [r5 + 28 * 16]
31180 pmaddubsw m3, m0, m6
31181 pmulhrsw m3, m7
31182 pmaddubsw m5, m2, m6
31183 pmulhrsw m5, m7
31184 packuswb m3, m5
31185 movu [r0 + 1910 * 16], m3
31186 pmaddubsw m3, m1, m6
31187 pmulhrsw m3, m7
31188 pmaddubsw m5, m4, m6
31189 pmulhrsw m5, m7
31190 packuswb m3, m5
31191 movu [r0 + 1911 * 16], m3
31192
31193 ; mode 32 [row 21]
31194 movu m6, [r5 + 14 * 16]
31195 pmaddubsw m3, m0, m6
31196 pmulhrsw m3, m7
31197 pmaddubsw m5, m2, m6
31198 pmulhrsw m5, m7
31199 packuswb m3, m5
31200 movu [r0 + 1962 * 16], m3
31201 pmaddubsw m3, m1, m6
31202 pmulhrsw m3, m7
31203 pmaddubsw m5, m4, m6
31204 pmulhrsw m5, m7
31205 packuswb m3, m5
31206 movu [r0 + 1963 * 16], m3
31207
31208 ; mode 33 [row 17]
31209 movu m6, [r5 + 20 * 16]
31210 pmaddubsw m3, m0, m6
31211 pmulhrsw m3, m7
31212 pmaddubsw m5, m2, m6
31213 pmulhrsw m5, m7
31214 packuswb m3, m5
31215 movu [r0 + 2018 * 16], m3
31216 pmaddubsw m3, m1, m6
31217 pmulhrsw m3, m7
31218 pmaddubsw m5, m4, m6
31219 pmulhrsw m5, m7
31220 packuswb m3, m5
31221 movu [r0 + 2019 * 16], m3
31222
31223 ; mode 31 [row 28]
31224 movu m6, [r5 + 13 * 16]
31225 movu m0, [r3 + 16]
31226 movd m1, [r3 + 17]
31227 palignr m1, m0, 1
31228 punpcklbw m0, m1
31229 pmaddubsw m3, m0, m6
31230 pmulhrsw m3, m7
31231 movu m2, [r3 + 24]
31232 movd m4, [r3 + 25]
31233 palignr m4, m2, 1
31234 punpcklbw m2, m4
31235 pmaddubsw m5, m2, m6
31236 pmulhrsw m5, m7
31237 packuswb m3, m5
31238 movu [r0 + 1912 * 16], m3
31239
31240 movu m1, [r3 + 32]
31241 movd m3, [r3 + 33]
31242 palignr m3, m1, 1
31243 punpcklbw m1, m3
31244 pmaddubsw m3, m1, m6
31245 pmulhrsw m3, m7
31246 movu m4, [r3 + 40]
31247 movd m5, [r3 + 41]
31248 palignr m5, m4, 1
31249 punpcklbw m4, m5
31250 pmaddubsw m5, m4, m6
31251 pmulhrsw m5, m7
31252 packuswb m3, m5
31253 movu [r0 + 1913 * 16], m3
31254
31255 ; mode 31 [row 29]
31256 movu m6, [r5 + 30 * 16]
31257 pmaddubsw m3, m0, m6
31258 pmulhrsw m3, m7
31259 pmaddubsw m5, m2, m6
31260 pmulhrsw m5, m7
31261 packuswb m3, m5
31262 movu [r0 + 1914 * 16], m3
31263 pmaddubsw m3, m1, m6
31264 pmulhrsw m3, m7
31265 pmaddubsw m5, m4, m6
31266 pmulhrsw m5, m7
31267 packuswb m3, m5
31268 movu [r0 + 1915 * 16], m3
31269
31270 ; mode 32 [row 22]
31271 movu m6, [r5 + 3 * 16]
31272 pmaddubsw m3, m0, m6
31273 pmulhrsw m3, m7
31274 pmaddubsw m5, m2, m6
31275 pmulhrsw m5, m7
31276 packuswb m3, m5
31277 movu [r0 + 1964 * 16], m3
31278 pmaddubsw m3, m1, m6
31279 pmulhrsw m3, m7
31280 pmaddubsw m5, m4, m6
31281 pmulhrsw m5, m7
31282 packuswb m3, m5
31283 movu [r0 + 1965 * 16], m3
31284
31285 ; mode 32 [row 23]
31286 movu m6, [r5 + 24 * 16]
31287 pmaddubsw m3, m0, m6
31288 pmulhrsw m3, m7
31289 pmaddubsw m5, m2, m6
31290 pmulhrsw m5, m7
31291 packuswb m3, m5
31292 movu [r0 + 1966 * 16], m3
31293 pmaddubsw m3, m1, m6
31294 pmulhrsw m3, m7
31295 pmaddubsw m5, m4, m6
31296 pmulhrsw m5, m7
31297 packuswb m3, m5
31298 movu [r0 + 1967 * 16], m3
31299
31300 ; mode 33 [row 18]
31301 movu m6, [r5 + 14 * 16]
31302 pmaddubsw m3, m0, m6
31303 pmulhrsw m3, m7
31304 pmaddubsw m5, m2, m6
31305 pmulhrsw m5, m7
31306 packuswb m3, m5
31307 movu [r0 + 2020 * 16], m3
31308 pmaddubsw m3, m1, m6
31309 pmulhrsw m3, m7
31310 pmaddubsw m5, m4, m6
31311 pmulhrsw m5, m7
31312 packuswb m3, m5
31313 movu [r0 + 2021 * 16], m3
31314
31315 ; mode 31 [row 30]
31316 movu m6, [r5 + 15 * 16]
31317 movu m0, [r3 + 17]
31318 movd m1, [r3 + 18]
31319 palignr m1, m0, 1
31320 punpcklbw m0, m1
31321 pmaddubsw m3, m0, m6
31322 pmulhrsw m3, m7
31323 movu m2, [r3 + 25]
31324 movd m4, [r3 + 26]
31325 palignr m4, m2, 1
31326 punpcklbw m2, m4
31327 pmaddubsw m5, m2, m6
31328 pmulhrsw m5, m7
31329 packuswb m3, m5
31330 movu [r0 + 1916 * 16], m3
31331
31332 movu m1, [r3 + 33]
31333 movd m3, [r3 + 34]
31334 palignr m3, m1, 1
31335 punpcklbw m1, m3
31336 pmaddubsw m3, m1, m6
31337 pmulhrsw m3, m7
31338 movu m4, [r3 + 41]
31339 movd m5, [r3 + 42]
31340 palignr m5, m4, 1
31341 punpcklbw m4, m5
31342 pmaddubsw m5, m4, m6
31343 pmulhrsw m5, m7
31344 packuswb m3, m5
31345 movu [r0 + 1917 * 16], m3
31346
31347 ; mode 32 [row 24]
31348 movu m6, [r5 + 13 * 16]
31349 pmaddubsw m3, m0, m6
31350 pmulhrsw m3, m7
31351 pmaddubsw m5, m2, m6
31352 pmulhrsw m5, m7
31353 packuswb m3, m5
31354 movu [r0 + 1968 * 16], m3
31355 pmaddubsw m3, m1, m6
31356 pmulhrsw m3, m7
31357 pmaddubsw m5, m4, m6
31358 pmulhrsw m5, m7
31359 packuswb m3, m5
31360 movu [r0 + 1969 * 16], m3
31361
31362 ; mode 33 [row 19]
31363 movu m6, [r5 + 8 * 16]
31364 pmaddubsw m3, m0, m6
31365 pmulhrsw m3, m7
31366 pmaddubsw m5, m2, m6
31367 pmulhrsw m5, m7
31368 packuswb m3, m5
31369 movu [r0 + 2022 * 16], m3
31370 pmaddubsw m3, m1, m6
31371 pmulhrsw m3, m7
31372 pmaddubsw m5, m4, m6
31373 pmulhrsw m5, m7
31374 packuswb m3, m5
31375 movu [r0 + 2023 * 16], m3
31376
31377 ; mode 31 [row 31]
31378 movu m0, [r3 + 18]
31379 movd m1, [r3 + 19]
31380 palignr m1, m0, 1
31381 punpcklbw m0, m1
31382 movu m2, [r3 + 26]
31383 movd m3, [r3 + 27]
31384 palignr m3, m2, 1
31385 punpcklbw m2, m3
31386 movu m1, [r3 + 34]
31387 movd m3, [r3 + 35]
31388 palignr m3, m1, 1
31389 punpcklbw m1, m3
31390 movu m4, [r3 + 42]
31391 movd m5, [r3 + 43]
31392 palignr m5, m4, 1
31393 punpcklbw m4, m5
31394
31395 pshufb m5, m0, [tab_S2]
31396 movh [r0 + 1918 * 16], m5
31397 pshufb m5, m2, [tab_S2]
31398 movh [r0 + 1918 * 16 + 8], m5
31399 pshufb m5, m1, [tab_S2]
31400 movh [r0 + 1919 * 16], m5
31401 pshufb m5, m4, [tab_S2]
31402 movh [r0 + 1919 * 16 + 8], m5
31403
31404 ; mode 32 [row 25]
31405 movu m6, [r5 + 2 * 16]
31406 pmaddubsw m3, m0, m6
31407 pmulhrsw m3, m7
31408 pmaddubsw m5, m2, m6
31409 pmulhrsw m5, m7
31410 packuswb m3, m5
31411 movu [r0 + 1970 * 16], m3
31412
31413 ; mode 33 [row 20 - first half]
31414 movu [r0 + 2024 * 16], m3
31415
31416 pmaddubsw m3, m1, m6
31417 pmulhrsw m3, m7
31418 pmaddubsw m5, m4, m6
31419 pmulhrsw m5, m7
31420 packuswb m3, m5
31421 movu [r0 + 1971 * 16], m3
31422
31423 ; mode 33 [row 20 - second half]
31424 movu [r0 + 2025 * 16], m3
31425
31426 ; mode 32 [row 26]
31427 movu m6, [r5 + 23 * 16]
31428 pmaddubsw m3, m0, m6
31429 pmulhrsw m3, m7
31430 pmaddubsw m5, m2, m6
31431 pmulhrsw m5, m7
31432 packuswb m3, m5
31433 movu [r0 + 1972 * 16], m3
31434 pmaddubsw m3, m1, m6
31435 pmulhrsw m3, m7
31436 pmaddubsw m5, m4, m6
31437 pmulhrsw m5, m7
31438 packuswb m3, m5
31439 movu [r0 + 1973 * 16], m3
31440
31441 ; mode 33 [row 21]
31442 movu m6, [r5 + 28 * 16]
31443 pmaddubsw m3, m0, m6
31444 pmulhrsw m3, m7
31445 pmaddubsw m5, m2, m6
31446 pmulhrsw m5, m7
31447 packuswb m3, m5
31448 movu [r0 + 2026 * 16], m3
31449 pmaddubsw m3, m1, m6
31450 pmulhrsw m3, m7
31451 pmaddubsw m5, m4, m6
31452 pmulhrsw m5, m7
31453 packuswb m3, m5
31454 movu [r0 + 2027 * 16], m3
31455
31456 ; mode 32 [row 27]
31457 movu m6, [r5 + 12 * 16]
31458 movu m0, [r3 + 19]
31459 movd m1, [r3 + 20]
31460 palignr m1, m0, 1
31461 punpcklbw m0, m1
31462 pmaddubsw m3, m0, m6
31463 pmulhrsw m3, m7
31464 movu m2, [r3 + 27]
31465 movd m4, [r3 + 28]
31466 palignr m4, m2, 1
31467 punpcklbw m2, m4
31468 pmaddubsw m5, m2, m6
31469 pmulhrsw m5, m7
31470 packuswb m3, m5
31471 movu [r0 + 1974 * 16], m3
31472
31473 movu m1, [r3 + 35]
31474 movd m3, [r3 + 36]
31475 palignr m3, m1, 1
31476 punpcklbw m1, m3
31477 pmaddubsw m3, m1, m6
31478 pmulhrsw m3, m7
31479 movu m4, [r3 + 43]
31480 movd m5, [r3 + 44]
31481 palignr m5, m4, 1
31482 punpcklbw m4, m5
31483 pmaddubsw m5, m4, m6
31484 pmulhrsw m5, m7
31485 packuswb m3, m5
31486 movu [r0 + 1975 * 16], m3
31487
31488 ; mode 33 [row 22]
31489 movu m6, [r5 + 22 * 16]
31490 pmaddubsw m3, m0, m6
31491 pmulhrsw m3, m7
31492 pmaddubsw m5, m2, m6
31493 pmulhrsw m5, m7
31494 packuswb m3, m5
31495 movu [r0 + 2028 * 16], m3
31496 pmaddubsw m3, m1, m6
31497 pmulhrsw m3, m7
31498 pmaddubsw m5, m4, m6
31499 pmulhrsw m5, m7
31500 packuswb m3, m5
31501 movu [r0 + 2029 * 16], m3
31502
31503 ; mode 32 [row 28]
31504 movu m6, [r5 + 1 * 16]
31505 movu m0, [r3 + 20]
31506 movd m1, [r3 + 21]
31507 palignr m1, m0, 1
31508 punpcklbw m0, m1
31509 pmaddubsw m3, m0, m6
31510 pmulhrsw m3, m7
31511 movu m2, [r3 + 28]
31512 movd m4, [r3 + 29]
31513 palignr m4, m2, 1
31514 punpcklbw m2, m4
31515 pmaddubsw m5, m2, m6
31516 pmulhrsw m5, m7
31517 packuswb m3, m5
31518 movu [r0 + 1976 * 16], m3
31519
31520 movu m1, [r3 + 36]
31521 movd m3, [r3 + 37]
31522 palignr m3, m1, 1
31523 punpcklbw m1, m3
31524 pmaddubsw m3, m1, m6
31525 pmulhrsw m3, m7
31526 movu m4, [r3 + 44]
31527 movd m5, [r3 + 45]
31528 palignr m5, m4, 1
31529 punpcklbw m4, m5
31530 pmaddubsw m5, m4, m6
31531 pmulhrsw m5, m7
31532 packuswb m3, m5
31533 movu [r0 + 1977 * 16], m3
31534
31535 ; mode 32 [row 29]
31536 movu m6, [r5 + 22 * 16]
31537 pmaddubsw m3, m0, m6
31538 pmulhrsw m3, m7
31539 pmaddubsw m5, m2, m6
31540 pmulhrsw m5, m7
31541 packuswb m3, m5
31542 movu [r0 + 1978 * 16], m3
31543 pmaddubsw m3, m1, m6
31544 pmulhrsw m3, m7
31545 pmaddubsw m5, m4, m6
31546 pmulhrsw m5, m7
31547 packuswb m3, m5
31548 movu [r0 + 1979 * 16], m3
31549
31550 ; mode 33 [row 23]
31551 movu m6, [r5 + 16 * 16]
31552 pmaddubsw m3, m0, m6
31553 pmulhrsw m3, m7
31554 pmaddubsw m5, m2, m6
31555 pmulhrsw m5, m7
31556 packuswb m3, m5
31557 movu [r0 + 2030 * 16], m3
31558 pmaddubsw m3, m1, m6
31559 pmulhrsw m3, m7
31560 pmaddubsw m5, m4, m6
31561 pmulhrsw m5, m7
31562 packuswb m3, m5
31563 movu [r0 + 2031 * 16], m3
31564
31565 ; mode 32 [row 30]
31566 movu m6, [r5 + 11 * 16]
31567 movu m0, [r3 + 21]
31568 movd m1, [r3 + 22]
31569 palignr m1, m0, 1
31570 punpcklbw m0, m1
31571 pmaddubsw m3, m0, m6
31572 pmulhrsw m3, m7
31573 movu m2, [r3 + 29]
31574 movd m4, [r3 + 30]
31575 palignr m4, m2, 1
31576 punpcklbw m2, m4
31577 pmaddubsw m5, m2, m6
31578 pmulhrsw m5, m7
31579 packuswb m3, m5
31580 movu [r0 + 1980 * 16], m3
31581
31582 movu m1, [r3 + 37]
31583 movd m3, [r3 + 38]
31584 palignr m3, m1, 1
31585 punpcklbw m1, m3
31586 pmaddubsw m3, m1, m6
31587 pmulhrsw m3, m7
31588 movu m4, [r3 + 45]
31589 movd m5, [r3 + 46]
31590 palignr m5, m4, 1
31591 punpcklbw m4, m5
31592 pmaddubsw m5, m4, m6
31593 pmulhrsw m5, m7
31594 packuswb m3, m5
31595 movu [r0 + 1981 * 16], m3
31596
31597 ; mode 33 [row 24]
31598 movu m6, [r5 + 10 * 16]
31599 pmaddubsw m3, m0, m6
31600 pmulhrsw m3, m7
31601 pmaddubsw m5, m2, m6
31602 pmulhrsw m5, m7
31603 packuswb m3, m5
31604 movu [r0 + 2032 * 16], m3
31605 pmaddubsw m3, m1, m6
31606 pmulhrsw m3, m7
31607 pmaddubsw m5, m4, m6
31608 pmulhrsw m5, m7
31609 packuswb m3, m5
31610 movu [r0 + 2033 * 16], m3
31611
31612 ; mode 32 [row 31]
31613 movu m0, [r3 + 22]
31614 movd m1, [r3 + 23]
31615 palignr m1, m0, 1
31616 punpcklbw m0, m1
31617 movu m2, [r3 + 30]
31618 movd m3, [r3 + 31]
31619 palignr m3, m2, 1
31620 punpcklbw m2, m3
31621 movu m1, [r3 + 38]
31622 movd m3, [r3 + 39]
31623 palignr m3, m1, 1
31624 punpcklbw m1, m3
31625 movu m4, [r3 + 46]
31626 movd m5, [r3 + 47]
31627 palignr m5, m4, 1
31628 punpcklbw m4, m5
31629
31630 pshufb m5, m0, [tab_S2]
31631 movh [r0 + 1982 * 16], m5
31632 pshufb m5, m2, [tab_S2]
31633 movh [r0 + 1982 * 16 + 8], m5
31634 pshufb m5, m1, [tab_S2]
31635 movh [r0 + 1983 * 16], m5
31636 pshufb m5, m4, [tab_S2]
31637 movh [r0 + 1983 * 16 + 8], m5
31638
31639 ; mode 33 [row 25]
31640 movu m6, [r5 + 4 * 16]
31641 pmaddubsw m3, m0, m6
31642 pmulhrsw m3, m7
31643 pmaddubsw m5, m2, m6
31644 pmulhrsw m5, m7
31645 packuswb m3, m5
31646 movu [r0 + 2034 * 16], m3
31647 pmaddubsw m3, m1, m6
31648 pmulhrsw m3, m7
31649 pmaddubsw m5, m4, m6
31650 pmulhrsw m5, m7
31651 packuswb m3, m5
31652 movu [r0 + 2035 * 16], m3
31653
31654 ; mode 33 [row 26]
31655 movu m6, [r5 + 30 * 16]
31656 pmaddubsw m3, m0, m6
31657 pmulhrsw m3, m7
31658 pmaddubsw m5, m2, m6
31659 pmulhrsw m5, m7
31660 packuswb m3, m5
31661 movu [r0 + 2036 * 16], m3
31662 pmaddubsw m3, m1, m6
31663 pmulhrsw m3, m7
31664 pmaddubsw m5, m4, m6
31665 pmulhrsw m5, m7
31666 packuswb m3, m5
31667 movu [r0 + 2037 * 16], m3
31668
31669 ; mode 33 [row 27]
31670 movu m6, [r5 + 24 * 16]
31671 movu m0, [r3 + 23]
31672 movd m1, [r3 + 24]
31673 palignr m1, m0, 1
31674 punpcklbw m0, m1
31675 pmaddubsw m3, m0, m6
31676 pmulhrsw m3, m7
31677 movu m2, [r3 + 31]
31678 movd m4, [r3 + 32]
31679 palignr m4, m2, 1
31680 punpcklbw m2, m4
31681 pmaddubsw m5, m2, m6
31682 pmulhrsw m5, m7
31683 packuswb m3, m5
31684 movu [r0 + 2038 * 16], m3
31685
31686 movu m1, [r3 + 39]
31687 movd m3, [r3 + 40]
31688 palignr m3, m1, 1
31689 punpcklbw m1, m3
31690 pmaddubsw m3, m1, m6
31691 pmulhrsw m3, m7
31692 movu m4, [r3 + 47]
31693 movd m5, [r3 + 48]
31694 palignr m5, m4, 1
31695 punpcklbw m4, m5
31696 pmaddubsw m5, m4, m6
31697 pmulhrsw m5, m7
31698 packuswb m3, m5
31699 movu [r0 + 2039 * 16], m3
31700
31701 ; mode 33 [row 28]
31702 movu m6, [r5 + 18 * 16]
31703 movu m0, [r3 + 24]
31704 movd m1, [r3 + 25]
31705 palignr m1, m0, 1
31706 punpcklbw m0, m1
31707 pmaddubsw m3, m0, m6
31708 pmulhrsw m3, m7
31709 movu m2, [r3 + 32]
31710 movd m4, [r3 + 33]
31711 palignr m4, m2, 1
31712 punpcklbw m2, m4
31713 pmaddubsw m5, m2, m6
31714 pmulhrsw m5, m7
31715 packuswb m3, m5
31716 movu [r0 + 2040 * 16], m3
31717
31718 movu m1, [r3 + 40]
31719 movd m3, [r3 + 41]
31720 palignr m3, m1, 1
31721 punpcklbw m1, m3
31722 pmaddubsw m3, m1, m6
31723 pmulhrsw m3, m7
31724 movu m4, [r3 + 48]
31725 movd m5, [r3 + 49]
31726 palignr m5, m4, 1
31727 punpcklbw m4, m5
31728 pmaddubsw m5, m4, m6
31729 pmulhrsw m5, m7
31730 packuswb m3, m5
31731 movu [r0 + 2041 * 16], m3
31732
31733 ; mode 33 [row 29]
31734 movu m6, [r5 + 12 * 16]
31735 movu m0, [r3 + 25]
31736 movd m1, [r3 + 26]
31737 palignr m1, m0, 1
31738 punpcklbw m0, m1
31739 pmaddubsw m3, m0, m6
31740 pmulhrsw m3, m7
31741 movu m2, [r3 + 33]
31742 movd m4, [r3 + 34]
31743 palignr m4, m2, 1
31744 punpcklbw m2, m4
31745 pmaddubsw m5, m2, m6
31746 pmulhrsw m5, m7
31747 packuswb m3, m5
31748 movu [r0 + 2042 * 16], m3
31749
31750 movu m1, [r3 + 41]
31751 movd m3, [r3 + 42]
31752 palignr m3, m1, 1
31753 punpcklbw m1, m3
31754 pmaddubsw m3, m1, m6
31755 pmulhrsw m3, m7
31756 movu m4, [r3 + 49]
31757 movd m5, [r3 + 50]
31758 palignr m5, m4, 1
31759 punpcklbw m4, m5
31760 pmaddubsw m5, m4, m6
31761 pmulhrsw m5, m7
31762 packuswb m3, m5
31763 movu [r0 + 2043 * 16], m3
31764
31765 ; mode 33 [row 30]
31766 movu m6, [r5 + 6 * 16]
31767 movu m0, [r3 + 26]
31768 movd m1, [r3 + 27]
31769 palignr m1, m0, 1
31770 punpcklbw m0, m1
31771 pmaddubsw m3, m0, m6
31772 pmulhrsw m3, m7
31773 movu m2, [r3 + 34]
31774 movd m4, [r3 + 35]
31775 palignr m4, m2, 1
31776 punpcklbw m2, m4
31777 pmaddubsw m5, m2, m6
31778 pmulhrsw m5, m7
31779 packuswb m3, m5
31780 movu [r0 + 2044 * 16], m3
31781
31782 movu m1, [r3 + 42]
31783 movd m3, [r3 + 43]
31784 palignr m3, m1, 1
31785 punpcklbw m1, m3
31786 pmaddubsw m3, m1, m6
31787 pmulhrsw m3, m7
31788 movu m4, [r3 + 50]
31789 movd m5, [r3 + 51]
31790 palignr m5, m4, 1
31791 punpcklbw m4, m5
31792 pmaddubsw m5, m4, m6
31793 pmulhrsw m5, m7
31794 packuswb m3, m5
31795 movu [r0 + 2045 * 16], m3
31796
31797 ; mode 33 [row 31]
31798 movu m5, [r3 + 27]
31799 movu [r0 + 2046 * 16], m5
31800 movu m5, [r3 + 43]
31801 movu [r0 + 2047 * 16], m5
31802
31803 ;mode 34 [row 0]
31804 movu m0, [r3 + 2]
31805 movu [r0 + 2048 * 16], m0
31806 movu m1, [r3 + 18]
31807 movu [r0 + 2049 * 16], m1
31808
31809 ;mode 34 [row 1]
31810 movu m2, [r3 + 34]
31811 palignr m3, m1, m0, 1
31812 movu [r0 + 2050 * 16], m3
31813 palignr m4, m2, m1, 1
31814 movu [r0 + 2051 * 16], m4
31815
31816 ;mode 34 [row 2]
31817 palignr m3, m1, m0, 2
31818 movu [r0 + 2052 * 16], m3
31819 palignr m4, m2, m1, 2
31820 movu [r0 + 2053 * 16], m4
31821
31822 ;mode 34 [row 3]
31823 palignr m3, m1, m0, 3
31824 movu [r0 + 2054 * 16], m3
31825 palignr m4, m2, m1, 3
31826 movu [r0 + 2055 * 16], m4
31827
31828 ;mode 34 [row 4]
31829 palignr m3, m1, m0, 4
31830 movu [r0 + 2056 * 16], m3
31831 palignr m4, m2, m1, 4
31832 movu [r0 + 2057 * 16], m4
31833
31834 ;mode 34 [row 5]
31835 palignr m3, m1, m0, 5
31836 movu [r0 + 2058 * 16], m3
31837 palignr m4, m2, m1, 5
31838 movu [r0 + 2059 * 16], m4
31839
31840 ;mode 34 [row 6]
31841 palignr m3, m1, m0, 6
31842 movu [r0 + 2060 * 16], m3
31843 palignr m4, m2, m1, 6
31844 movu [r0 + 2061 * 16], m4
31845
31846 ;mode 34 [row 7]
31847 palignr m3, m1, m0, 7
31848 movu [r0 + 2062 * 16], m3
31849 palignr m4, m2, m1, 7
31850 movu [r0 + 2063 * 16], m4
31851
31852 ;mode 34 [row 8]
31853 palignr m3, m1, m0, 8
31854 movu [r0 + 2064 * 16], m3
31855 palignr m4, m2, m1, 8
31856 movu [r0 + 2065 * 16], m4
31857
31858 ;mode 34 [row 9]
31859 palignr m3, m1, m0, 9
31860 movu [r0 + 2066 * 16], m3
31861 palignr m4, m2, m1, 9
31862 movu [r0 + 2067 * 16], m4
31863
31864 ;mode 34 [row 10]
31865 palignr m3, m1, m0, 10
31866 movu [r0 + 2068 * 16], m3
31867 palignr m4, m2, m1, 10
31868 movu [r0 + 2069 * 16], m4
31869
31870 ;mode 34 [row 11]
31871 palignr m3, m1, m0, 11
31872 movu [r0 + 2070 * 16], m3
31873 palignr m4, m2, m1, 11
31874 movu [r0 + 2071 * 16], m4
31875
31876 ;mode 34 [row 12]
31877 palignr m3, m1, m0, 12
31878 movu [r0 + 2072 * 16], m3
31879 palignr m4, m2, m1, 12
31880 movu [r0 + 2073 * 16], m4
31881
31882 ;mode 34 [row 13]
31883 palignr m3, m1, m0, 13
31884 movu [r0 + 2074 * 16], m3
31885 palignr m4, m2, m1, 13
31886 movu [r0 + 2075 * 16], m4
31887
31888 ;mode 34 [row 14]
31889 palignr m3, m1, m0, 14
31890 movu [r0 + 2076 * 16], m3
31891 palignr m4, m2, m1, 14
31892 movu [r0 + 2077 * 16], m4
31893
31894 ;mode 34 [row 15]
31895 palignr m3, m1, m0, 15
31896 movu [r0 + 2078 * 16], m3
31897 palignr m4, m2, m1, 15
31898 movu [r0 + 2079 * 16], m4
31899
31900 ;mode 34 [row 16]
31901 palignr m3, m1, m0, 16
31902 movu [r0 + 2080 * 16], m3
31903 palignr m4, m2, m1, 16
31904 movu [r0 + 2081 * 16], m4
31905
31906 ;mode 34 [row 17]
31907 movu m0, [r3 + 19]
31908 movu [r0 + 2082 * 16], m0
31909 movu m1, [r3 + 35]
31910 movu [r0 + 2083 * 16], m1
31911
31912 ;mode 34 [row 18]
31913 movu m2, [r3 + 51]
31914 palignr m3, m1, m0, 1
31915 movu [r0 + 2084 * 16], m3
31916 palignr m4, m2, m1, 1
31917 movu [r0 + 2085 * 16], m4
31918
31919 ;mode 34 [row 19]
31920 palignr m3, m1, m0, 2
31921 movu [r0 + 2086 * 16], m3
31922 palignr m4, m2, m1, 2
31923 movu [r0 + 2087 * 16], m4
31924
31925 ;mode 34 [row 20]
31926 palignr m3, m1, m0, 3
31927 movu [r0 + 2088 * 16], m3
31928 palignr m4, m2, m1, 3
31929 movu [r0 + 2089 * 16], m4
31930
31931 ;mode 34 [row 21]
31932 palignr m3, m1, m0, 4
31933 movu [r0 + 2090 * 16], m3
31934 palignr m4, m2, m1, 4
31935 movu [r0 + 2091 * 16], m4
31936
31937 ;mode 34 [row 22]
31938 palignr m3, m1, m0, 5
31939 movu [r0 + 2092 * 16], m3
31940 palignr m4, m2, m1, 5
31941 movu [r0 + 2093 * 16], m4
31942
31943 ;mode 34 [row 23]
31944 palignr m3, m1, m0, 6
31945 movu [r0 + 2094 * 16], m3
31946 palignr m4, m2, m1, 6
31947 movu [r0 + 2095 * 16], m4
31948
31949 ;mode 34 [row 24]
31950 palignr m3, m1, m0, 7
31951 movu [r0 + 2096 * 16], m3
31952 palignr m4, m2, m1, 7
31953 movu [r0 + 2097 * 16], m4
31954
31955 ;mode 34 [row 25]
31956 palignr m3, m1, m0, 8
31957 movu [r0 + 2098 * 16], m3
31958 palignr m4, m2, m1, 8
31959 movu [r0 + 2099 * 16], m4
31960
31961 ;mode 34 [row 26]
31962 palignr m3, m1, m0, 9
31963 movu [r0 + 2100 * 16], m3
31964 palignr m4, m2, m1, 9
31965 movu [r0 + 2101 * 16], m4
31966
31967 ;mode 34 [row 27]
31968 palignr m3, m1, m0, 10
31969 movu [r0 + 2102 * 16], m3
31970 palignr m4, m2, m1, 10
31971 movu [r0 + 2103 * 16], m4
31972
31973 ;mode 34 [row 28]
31974 palignr m3, m1, m0, 11
31975 movu [r0 + 2104 * 16], m3
31976 palignr m4, m2, m1, 11
31977 movu [r0 + 2105 * 16], m4
31978
31979 ;mode 34 [row 29]
31980 palignr m3, m1, m0, 12
31981 movu [r0 + 2106 * 16], m3
31982 palignr m4, m2, m1, 12
31983 movu [r0 + 2107 * 16], m4
31984
31985 ;mode 34 [row 30]
31986 palignr m3, m1, m0, 13
31987 movu [r0 + 2108 * 16], m3
31988 palignr m4, m2, m1, 13
31989 movu [r0 + 2109 * 16], m4
31990
31991 ;mode 34 [row 31]
31992 palignr m3, m1, m0, 14
31993 movu [r0 + 2110 * 16], m3
31994 palignr m4, m2, m1, 14
31995 movu [r0 + 2111 * 16], m4
31996
31997 RET