Imported Upstream version 1.4
[deb_x265.git] / source / common / x86 / intrapred16.asm
CommitLineData
72b9787e
JB
1;*****************************************************************************
2;* Copyright (C) 2013 x265 project
3;*
4;* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
5;* Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com>
6;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
7;*
8;* This program is free software; you can redistribute it and/or modify
9;* it under the terms of the GNU General Public License as published by
10;* the Free Software Foundation; either version 2 of the License, or
11;* (at your option) any later version.
12;*
13;* This program is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16;* GNU General Public License for more details.
17;*
18;* You should have received a copy of the GNU General Public License
19;* along with this program; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21;*
22;* This program is also available under a commercial proprietary license.
23;* For more information, contact us at license @ x265.com.
24;*****************************************************************************/
25
26%include "x86inc.asm"
27%include "x86util.asm"
28
SECTION_RODATA 32

; ang_table: 32 rows of 16 bytes. Row x holds the word pair (32 - x, x)
; repeated four times, so a pmaddwd of interleaved samples (a, b) against
; row x yields a * (32 - x) + b * x in every dword lane.
const ang_table
%assign x 0
%rep 32
    times 4 dw (32-x), x
%assign x x+1
%endrep

; Byte-shuffle masks (one 16-byte control vector each).
const shuf_mode_13_23,  db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_14_22,  db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_15_21,  db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_16_20,  db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
const shuf_mode_17_19,  db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
const shuf_mode32_18,   db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
const pw_punpcklwd,     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
const c_mode32_10_0,    db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1

const pw_unpackwdq,     times 8 db 0,1
const pw_ang8_12,       db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  0,  1
const pw_ang8_13,       db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  8,  9,  0,  1
const pw_ang8_14,       db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 10, 11,  4,  5,  0,  1
const pw_ang8_15,       db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
const pw_ang8_16,       db  0,  0,  0,  0,  0,  0, 12, 13, 10, 11,  6,  7,  4,  5,  0,  1
const pw_ang8_17,       db  0,  0, 14, 15, 12, 13, 10, 11,  8,  9,  4,  5,  2,  3,  0,  1
const pw_swap16,        db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1

const pw_ang16_13,      db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
const pw_ang16_16,      db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
58
59SECTION .text
60
61cextern pw_1
62cextern pw_8
63cextern pw_1023
64cextern pd_16
65cextern pd_32
66cextern pw_4096
67cextern multiL
68cextern multiH
69cextern multi_2Row
70cextern pw_swap
71cextern pb_unpackwq1
72cextern pb_unpackwq2
73
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 4x4 DC prediction on 16-bit pixels.
;   r0 = dst, r1 = dstStride (in pixels, scaled by 2 in the addressing), r2 = left, r3 = above
;   The 'filter' argument (r5m) selects the optional edge smoothing pass.
;   Both reference pointers are advanced by one pixel before use.
;-------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc4, 4,6,2
    mov         r4d, r5m                    ; r4d = filter flag
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above

    ; horizontal sum of four above + four left samples
    movh        m0, [r3]
    movh        m1, [r2]
    paddw       m0, m1
    pshufd      m1, m0, 1
    paddw       m0, m1
    phaddw      m0, m0                      ; word 0 = total of the eight samples

    ; ZF is latched here and consumed by the jz below; every instruction in
    ; between (SSE ops, mov/movzx/movd, lea) leaves EFLAGS alone.
    test        r4d, r4d

    pmulhrsw    m0, [pw_4096]               ; (sum + 4) >> 3, assuming pw_4096 holds words of 4096
    movd        r4d, m0
    movzx       r4d, r4w                    ; r4d = dc_val
    pshuflw     m0, m0, 0                   ; low four words = dc_val

    ; fill the 4x4 block with dc_val
    lea         r5, [r0 + r1 * 4]           ; r5 -> row 2
    movh        [r0], m0
    movh        [r0 + r1 * 2], m0
    movh        [r5], m0
    movh        [r5 + r1 * 2], m0

    jz          .end                        ; filter == 0: done

    ; edge smoothing: edge = (ref + 3 * dc + 2) >> 2
    lea         r5d, [r4d * 2 + 2]          ; r5d = 2 * dc + 2
    add         r4d, r5d                    ; r4d = 3 * dc + 2
    movd        m0, r4d
    pshuflw     m0, m0, 0                   ; low four words = 3 * dc + 2

    ; top row (element 0 is rewritten by the corner step below)
    movu        m1, [r3]
    paddw       m1, m0
    psraw       m1, 2
    movh        [r0], m1

    ; corner: (above[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, word [r3]
    add         r5d, r3d
    movzx       r3d, word [r2]
    add         r3d, r5d
    shr         r3d, 2
    mov         [r0], r3w

    ; left column, rows 1..3
    lea         r0, [r0 + r1 * 2]           ; r0 -> row 1
    movu        m1, [r2 + 2]
    paddw       m1, m0
    psraw       m1, 2
    pextrw      [r0 + r1 * 4], m1, 2        ; row 3
    movd        r3d, m1
    mov         [r0], r3w                   ; row 1
    shr         r3d, 16
    mov         [r0 + r1 * 2], r3w          ; row 2

.end:
    RET
140
141
142
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 8x8 DC prediction on 16-bit pixels.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   r6 keeps the original dst for the optional edge smoothing pass.
;-------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc8, 4, 7, 2
    mov         r4d, r5m                    ; r4d = filter flag
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    ; sum eight above + eight left samples
    movu        m0, [r3]
    movu        m1, [r2]
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    phaddw      m0, m0
    pmaddwd     m0, [pw_1]                  ; dword 0 = full sum

    movd        r5d, m0
    add         r5d, 8
    shr         r5d, 4                      ; r5d = dc_val = (sum + 8) / 16
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0                   ; all eight words = dc_val

    ; ZF survives until the jz: only mov/movu/lea follow.
    test        r4d, r4d

    ; fill the 8x8 block
    mov         r6, r0
    movu        [r0], m1                    ; row 0
%rep 3                                      ; rows 1..6, two per step
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movu        [r0 + r1], m1               ; row 7

    jz          .end                        ; filter == 0: done

    ; edge smoothing: edge = (ref + 3 * dc + 2) >> 2
    lea         r4d, [r5d * 2 + 2]          ; r4d = 2 * dc + 2
    add         r5d, r4d                    ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0

    ; top row
    movu        m0, [r3]
    paddw       m0, m1
    psraw       m0, 2
    movu        [r6], m0

    ; corner: (above[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, word [r3]
    add         r4d, r3d
    movzx       r3d, word [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r6], r3w

    ; left column, rows 1..7
    add         r6, r1                      ; r6 -> row 1
    movu        m0, [r2 + 2]
    paddw       m0, m1
    psraw       m0, 2
    pextrw      [r6], m0, 0
    pextrw      [r6 + r1], m0, 1
    pextrw      [r6 + r1 * 2], m0, 2
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6 + r1], m0, 3
    pextrw      [r6 + r1 * 2], m0, 4
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6 + r1], m0, 5
    pextrw      [r6 + r1 * 2], m0, 6

.end:
    RET
223
224
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 16x16 DC prediction on 16-bit pixels.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   r6 keeps the original dst for the optional edge smoothing pass.
;-------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 4, 7, 4
    mov         r4d, r5m                    ; r4d = filter flag
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    ; sum sixteen above + sixteen left samples
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r2]
    movu        m3, [r2 + 16]
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    movhlps     m1, m0
    paddw       m0, m1
    phaddw      m0, m0
    pmaddwd     m0, [pw_1]                  ; dword 0 = full sum

    movd        r5d, m0
    add         r5d, 16
    shr         r5d, 5                      ; r5d = dc_val = (sum + 16) / 32
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0                   ; all eight words = dc_val

    ; ZF survives until the jz: only mov/movu/lea follow.
    test        r4d, r4d

    ; fill the 16x16 block, two rows per step
    mov         r6, r0
%rep 7
    movu        [r0], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16 + r1], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movu        [r0], m1                    ; rows 14 and 15
    movu        [r0 + 16], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16 + r1], m1

    jz          .end                        ; filter == 0: done

    ; edge smoothing: edge = (ref + 3 * dc + 2) >> 2
    lea         r4d, [r5d * 2 + 2]          ; r4d = 2 * dc + 2
    add         r5d, r4d                    ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0

    ; top row
    movu        m2, [r3]
    paddw       m2, m1
    psraw       m2, 2
    movu        [r6], m2
    movu        m3, [r3 + 16]
    paddw       m3, m1
    psraw       m3, 2
    movu        [r6 + 16], m3

    ; corner: (above[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, word [r3]
    add         r4d, r3d
    movzx       r3d, word [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r6], r3w

    ; left column, rows 1..8
    add         r6, r1                      ; r6 -> row 1
    movu        m2, [r2 + 2]
    paddw       m2, m1
    psraw       m2, 2

    pextrw      [r6], m2, 0
    pextrw      [r6 + r1], m2, 1
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m2, 2
    pextrw      [r6 + r1], m2, 3
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m2, 4
    pextrw      [r6 + r1], m2, 5
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m2, 6
    pextrw      [r6 + r1], m2, 7

    ; left column, rows 9..15
    lea         r6, [r6 + r1 * 2]
    movu        m3, [r2 + 18]
    paddw       m3, m1
    psraw       m3, 2

    pextrw      [r6], m3, 0
    pextrw      [r6 + r1], m3, 1
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m3, 2
    pextrw      [r6 + r1], m3, 3
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m3, 4
    pextrw      [r6 + r1], m3, 5
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m3, 6

.end:
    RET
360
361
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 32x32 DC prediction on 16-bit pixels.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   No edge smoothing is performed at this size, so the 'filter' argument is
;   never read.
;   After the sum is formed r2 and r3 are recycled as 3 * stride and the
;   loop counter.
;-------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 4, 4, 6
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    ; sum 32 above samples
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r3 + 32]
    movu        m3, [r3 + 48]
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2

    ; sum 32 left samples and merge
    movu        m1, [r2]
    movu        m3, [r2 + 16]
    movu        m4, [r2 + 32]
    movu        m5, [r2 + 48]
    paddw       m1, m3
    paddw       m4, m5
    paddw       m1, m4
    paddw       m0, m1

    ; reduce to a single dword
    movhlps     m1, m0
    paddw       m0, m1
    phaddw      m0, m0
    pmaddwd     m0, [pw_1]

    paddd       m0, [pd_32]                 ; sum + 32
    psrld       m0, 6                       ; dc_val = (sum + 32) / 64
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0                   ; all eight words = dc_val

    lea         r2, [r1 * 3]                ; r2 = 3 * stride
    mov         r3d, 4                      ; 4 iterations x 8 rows = 32 rows
.loop:
%rep 2                                      ; four rows per repetition
    movu        [r0 + 0], m0
    movu        [r0 + 16], m0
    movu        [r0 + 32], m0
    movu        [r0 + 48], m0
    movu        [r0 + r1 + 0], m0
    movu        [r0 + r1 + 16], m0
    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m0
    movu        [r0 + r1 * 2 + 0], m0
    movu        [r0 + r1 * 2 + 16], m0
    movu        [r0 + r1 * 2 + 32], m0
    movu        [r0 + r1 * 2 + 48], m0
    movu        [r0 + r2 + 0], m0
    movu        [r0 + r2 + 16], m0
    movu        [r0 + r2 + 32], m0
    movu        [r0 + r2 + 48], m0
    lea         r0, [r0 + r1 * 4]
%endrep
    dec         r3d
    jnz         .loop
    RET
437
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 4x4 planar prediction on 16-bit pixels, two rows per macro expansion.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   r6d = topRight (above[4]); m2 = 2 * bottomRow step; m0 = running vertical
;   term (low half: even row, high half: odd row).
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar4, 4,7,5
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    movh        m0, [r3]
    punpcklqdq  m0, m0                      ; topRow in both halves

    pxor        m1, m1
    movd        m2, [r2 + 8]                ; bottomLeft = left[4]
    movzx       r6d, word [r3 + 8]          ; topRight = above[4]
    pshuflw     m2, m2, 0
    pshufd      m2, m2, 0                   ; broadcast bottomLeft

    psubw       m2, m0                      ; bottomRow[i] = bottomLeft - topRow[i]
    psllw       m0, 2                       ; 4 * topRow
    punpcklqdq  m3, m2, m1                  ; low = bottomRow, high = 0
    psubw       m0, m3                      ; pre-bias so the first "+= 2 * bottomRow" lands on rows 0/1
    paddw       m2, m2                      ; per-pair step = 2 * bottomRow

; %1 = byte offset into left[] of the first of the two rows
%macro COMP_PRED_PLANAR_2ROW 1
    ; horPred = 4 * left[y] + 4 for both rows
    movzx       r4d, word [r2 + %1]
    lea         r4d, [r4d * 4 + 4]
    movd        m3, r4d
    pshuflw     m3, m3, 0

    movzx       r4d, word [r2 + %1 + 2]
    lea         r4d, [r4d * 4 + 4]
    movd        m4, r4d
    pshuflw     m4, m4, 0
    punpcklqdq  m3, m4                      ; low = row y, high = row y + 1

    ; rightColumn = topRight - left[y] for both rows
    movzx       r4d, word [r2 + %1]
    mov         r5d, r6d
    sub         r5d, r4d
    movd        m4, r5d
    pshuflw     m4, m4, 0

    movzx       r4d, word [r2 + %1 + 2]
    mov         r5d, r6d
    sub         r5d, r4d
    movd        m1, r5d
    pshuflw     m1, m1, 0
    punpcklqdq  m4, m1

    pmullw      m4, [multi_2Row]            ; scale by the per-column weights
    paddw       m3, m4
    paddw       m0, m2                      ; advance the vertical term by two rows
    paddw       m3, m0
    psraw       m3, 3

    movh        [r0], m3                    ; row y
    pshufd      m3, m3, 0xAE                ; bring the high half down
    movh        [r0 + r1], m3               ; row y + 1
    lea         r0, [r0 + 2 * r1]
%endmacro

    COMP_PRED_PLANAR_2ROW 0                 ; rows 0 and 1
    COMP_PRED_PLANAR_2ROW 4                 ; rows 2 and 3
; NOTE(review): %undef only removes single-line macros; %unmacro would be
; needed to actually drop this multi-line macro. Confirm intent.
%undef COMP_PRED_PLANAR_2ROW
    RET
502
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 8x8 planar prediction on 16-bit pixels, one row per macro expansion.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   m1 = running vertical term, m4 = bottomRow step,
;   m3 = rightColumn per row, m6 = 8 * left + 8 per row.
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar8, 4,4,7
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    movu        m1, [r3]                    ; topRow
    movu        m2, [r2]                    ; leftColumn

    movd        m3, [r3 + 16]               ; topRight = above[8]
    movd        m4, [r2 + 16]               ; bottomLeft = left[8]

    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0                   ; broadcast topRight
    pshuflw     m4, m4, 0
    pshufd      m4, m4, 0                   ; broadcast bottomLeft

    psubw       m4, m1                      ; bottomRow = bottomLeft - topRow
    psubw       m3, m2                      ; rightColumn = topRight - leftColumn

    psllw       m1, 3                       ; 8 * topRow
    psllw       m2, 3                       ; 8 * leftColumn

    paddw       m6, m2, [pw_8]              ; horPred base = 8 * left + 8

; %1 = row index 0..7; selects word %1 of m6 / m3
%macro PRED_PLANAR_ROW8 1
  %if (%1 < 4)
    pshuflw     m5, m6, 0x55 * %1           ; rows 0..3 live in the low qword
    pshufd      m5, m5, 0
    pshuflw     m2, m3, 0x55 * %1
    pshufd      m2, m2, 0
  %else
    pshufhw     m5, m6, 0x55 * (%1 - 4)     ; rows 4..7 live in the high qword
    pshufd      m5, m5, 0xAA
    pshufhw     m2, m3, 0x55 * (%1 - 4)
    pshufd      m2, m2, 0xAA
  %endif

    pmullw      m2, [multiL]                ; rightColumn scaled by the per-column weights
    paddw       m5, m2
    paddw       m1, m4                      ; advance the vertical term by one row
    paddw       m5, m1
    psraw       m5, 4

    movu        [r0], m5
    add         r0, r1

%endmacro

    PRED_PLANAR_ROW8 0
    PRED_PLANAR_ROW8 1
    PRED_PLANAR_ROW8 2
    PRED_PLANAR_ROW8 3
    PRED_PLANAR_ROW8 4
    PRED_PLANAR_ROW8 5
    PRED_PLANAR_ROW8 6
    PRED_PLANAR_ROW8 7

%undef PRED_PLANAR_ROW8
    RET
565
566
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 16x16 planar prediction on 16-bit pixels.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   BIT_DEPTH == 12 : 32-bit intermediates, row loop (.loopH).
;   otherwise       : 16-bit intermediates, fully unrolled via PRED_PLANAR_ROW16.
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
%if (BIT_DEPTH == 12)

; bottomRow0..3 hold (bottomLeft - topRow) as dwords, four columns each.
; On x86-64 they live in m7..m10; on x86-32 three of them spill to the stack.
%if (ARCH_X86_64 == 1)
cglobal intra_pred_planar16, 4,7,8+3
%define bottomRow0  m7
%define bottomRow1  m8
%define bottomRow2  m9
%define bottomRow3  m10
%else
cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize
%define bottomRow0  [rsp + 0*mmsize]
%define bottomRow1  [rsp + 1*mmsize]
%define bottomRow2  [rsp + 2*mmsize]
%define bottomRow3  m7
%endif

    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    pxor        m0, m0                      ; zero, used for word -> dword unpacking

    ; bottomRow[i] = bottomLeft - topRow[i]
    movzx       r4d, word [r2 + 16*2]
    movd        m1, r4d
    pshufd      m1, m1, 0                   ; m1 = bottomLeft in every dword
    movu        m2, [r3]
    pmovzxwd    m3, m2
    punpckhwd   m2, m0
    psubd       m4, m1, m3
    mova        bottomRow0, m4
    psubd       m4, m1, m2
    mova        bottomRow1, m4
    movu        m2, [r3 + 16]
    pmovzxwd    m3, m2
    punpckhwd   m2, m0
    psubd       m4, m1, m3
    mova        bottomRow2, m4
    psubd       m1, m2
    mova        bottomRow3, m1

    ; running vertical term starts at 16 * topRow
    pmovzxwd    m0, [r3 + 0*8]
    pslld       m0, 4
    pmovzxwd    m1, [r3 + 1*8]
    pslld       m1, 4
    pmovzxwd    m2, [r3 + 2*8]
    pslld       m2, 4
    pmovzxwd    m3, [r3 + 3*8]
    pslld       m3, 4

    xor         r6, r6                      ; r6 = row index y
.loopH:
    movzx       r4d, word [r2 + r6*2]       ; r4d = left[y]
    movzx       r5d, word [r3 + 16*2]       ; r5d = topRight
    sub         r5d, r4d
    movd        m5, r5d
    pshuflw     m5, m5, 0
    pmullw      m5, [multiL]
    pmovsxwd    m5, m5                      ; (topRight - left[y]) * per-column weights
    add         r4d, r4d
    lea         r4d, [r4d * 8 + 16]         ; 16 * left[y] + 16
    movd        m4, r4d
    pshufd      m4, m4, 0
    paddd       m4, m5                      ; horizontal term for columns 0..3
    pshufd      m6, m5, 0xFF                ; lane 3 of m5 broadcast: step to the next 4 columns

    ; columns 0-3
    paddd       m0, bottomRow0
    paddd       m5, m0, m4
    psrad       m5, 5
    packusdw    m5, m5
    movh        [r0 + 0*8], m5

    ; columns 4-7
    paddd       m4, m6
    paddd       m1, bottomRow1
    paddd       m5, m1, m4
    psrad       m5, 5
    packusdw    m5, m5
    movh        [r0 + 1*8], m5

    ; columns 8-11
    paddd       m4, m6
    paddd       m2, bottomRow2
    paddd       m5, m2, m4
    psrad       m5, 5
    packusdw    m5, m5
    movh        [r0 + 2*8], m5

    ; columns 12-15
    paddd       m4, m6
    paddd       m3, bottomRow3
    paddd       m5, m3, m4
    psrad       m5, 5
    packusdw    m5, m5
    movh        [r0 + 3*8], m5

    add         r0, r1
    inc         r6d
    cmp         r6d, 16
    jnz         .loopH
    RET

%else ; BIT_DEPTH != 12
INIT_XMM sse4
cglobal intra_pred_planar16, 4,6,7
    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    movu        m1, [r3]                    ; topRow[0-7]
    movu        m2, [r3 + 16]               ; topRow[8-15]

    movd        m3, [r2 + 32]               ; bottomLeft = left[16]
    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0
    movzx       r4d, word [r3 + 32]         ; topRight = above[16]

    psubw       m4, m3, m1                  ; bottomRow step, columns 0-7
    psubw       m3, m2                      ; bottomRow step, columns 8-15

    psllw       m1, 4                       ; running vertical term = 16 * topRow
    psllw       m2, 4

; %1 = row index; note r3 is recycled as scratch here (topRight is kept in r4d)
%macro PRED_PLANAR_ROW16 1
    movzx       r5d, word [r2 + %1 * 2]
    add         r5d, r5d
    lea         r5d, [r5d * 8 + 16]         ; 16 * left[y] + 16
    movd        m5, r5d
    pshuflw     m5, m5, 0
    pshufd      m5, m5, 0                   ; horPred

    movzx       r5d, word [r2 + %1 * 2]
    mov         r3d, r4d
    sub         r3d, r5d                    ; topRight - left[y]
    movd        m0, r3d
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0

    ; columns 0-7
    pmullw      m6, m0, [multiL]
    paddw       m6, m5
    paddw       m1, m4
    paddw       m6, m1
    psraw       m6, 5

    ; columns 8-15
    pmullw      m0, m0, [multiH]
    paddw       m5, m0
    paddw       m2, m3
    paddw       m5, m2
    psraw       m5, 5

    movu        [r0], m6
    movu        [r0 + 16], m5
    add         r0, r1
%endmacro

    PRED_PLANAR_ROW16 0
    PRED_PLANAR_ROW16 1
    PRED_PLANAR_ROW16 2
    PRED_PLANAR_ROW16 3
    PRED_PLANAR_ROW16 4
    PRED_PLANAR_ROW16 5
    PRED_PLANAR_ROW16 6
    PRED_PLANAR_ROW16 7
    PRED_PLANAR_ROW16 8
    PRED_PLANAR_ROW16 9
    PRED_PLANAR_ROW16 10
    PRED_PLANAR_ROW16 11
    PRED_PLANAR_ROW16 12
    PRED_PLANAR_ROW16 13
    PRED_PLANAR_ROW16 14
    PRED_PLANAR_ROW16 15
%undef PRED_PLANAR_ROW16
    RET
%endif
747
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 32x32 planar prediction on 16-bit pixels with 32-bit intermediates.
;   r0 = dst, r1 = dstStride (converted to bytes below), r2 = left, r3 = above
;   bottomRow0..7 : (bottomLeft - topRow) as dwords, four columns each
;   m0..m3        : running vertical term for columns 0..15
;   tmp0..tmp3    : running vertical term for columns 16..31 (kept in memory)
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
%if (ARCH_X86_64 == 1)
cglobal intra_pred_planar32, 4,7,8+8, 0-4*mmsize
  %define bottomRow0  m8
  %define bottomRow1  m9
  %define bottomRow2  m10
  %define bottomRow3  m11
  %define bottomRow4  m12
  %define bottomRow5  m13
  %define bottomRow6  m14
  %define bottomRow7  m15
  %define tmp0        [rsp + 0*mmsize]
  %define tmp1        [rsp + 1*mmsize]
  %define tmp2        [rsp + 2*mmsize]
  %define tmp3        [rsp + 3*mmsize]
%else
cglobal intra_pred_planar32, 4,7,8, 0-12*mmsize
  %define bottomRow0  [rsp + 0*mmsize]
  %define bottomRow1  [rsp + 1*mmsize]
  %define bottomRow2  [rsp + 2*mmsize]
  %define bottomRow3  [rsp + 3*mmsize]
  %define bottomRow4  [rsp + 4*mmsize]
  %define bottomRow5  [rsp + 5*mmsize]
  %define bottomRow6  [rsp + 6*mmsize]
  %define bottomRow7  [rsp + 7*mmsize]
  %define tmp0        [rsp + 8*mmsize]
  %define tmp1        [rsp + 9*mmsize]
  %define tmp2        [rsp + 10*mmsize]
  %define tmp3        [rsp + 11*mmsize]
%endif

    add         r2, 2                       ; skip element 0 of left
    add         r3, 2                       ; skip element 0 of above
    add         r1, r1                      ; stride in bytes

    pxor        m0, m0                      ; zero, used for word -> dword unpacking

    ; bottomRow[i] = bottomLeft - topRow[i]
    movzx       r4d, word [r2 + 32*2]
    movd        m1, r4d
    pshufd      m1, m1, 0                   ; m1 = bottomLeft in every dword
    movu        m2, [r3]
    pmovzxwd    m3, m2
    punpckhwd   m2, m0
    psubd       m4, m1, m3
    mova        bottomRow0, m4
    psubd       m4, m1, m2
    mova        bottomRow1, m4
    movu        m2, [r3 + 16]
    pmovzxwd    m3, m2
    punpckhwd   m2, m0
    psubd       m4, m1, m3
    mova        bottomRow2, m4
    psubd       m4, m1, m2
    mova        bottomRow3, m4

    movu        m2, [r3 + 32]
    pmovzxwd    m3, m2
    punpckhwd   m2, m0
    psubd       m4, m1, m3
    mova        bottomRow4, m4
    psubd       m4, m1, m2
    mova        bottomRow5, m4
    movu        m2, [r3 + 48]
    pmovzxwd    m3, m2
    punpckhwd   m2, m0
    psubd       m4, m1, m3
    mova        bottomRow6, m4
    psubd       m1, m2
    mova        bottomRow7, m1

    ; running vertical term starts at 32 * topRow
    pmovzxwd    m0, [r3 + 0*8]
    pslld       m0, 5
    pmovzxwd    m1, [r3 + 1*8]
    pslld       m1, 5
    pmovzxwd    m2, [r3 + 2*8]
    pslld       m2, 5
    pmovzxwd    m3, [r3 + 3*8]
    pslld       m3, 5

    pmovzxwd    m4, [r3 + 4*8]
    pslld       m4, 5
    mova        tmp0, m4
    pmovzxwd    m4, [r3 + 5*8]
    pslld       m4, 5
    mova        tmp1, m4
    pmovzxwd    m4, [r3 + 6*8]
    pslld       m4, 5
    mova        tmp2, m4
    pmovzxwd    m4, [r3 + 7*8]
    pslld       m4, 5
    mova        tmp3, m4

    xor         r6, r6                      ; r6 = row index y
.loopH:
    movzx       r4d, word [r2 + r6*2]       ; r4d = left[y]
    movzx       r5d, word [r3 + 32*2]       ; r5d = topRight
    sub         r5d, r4d
    movd        m5, r5d
    pshuflw     m5, m5, 0
    pmullw      m5, [multiL]
    pmovsxwd    m5, m5                      ; (topRight - left[y]) * per-column weights
    shl         r4d, 5
    add         r4d, 32                     ; 32 * left[y] + 32
    movd        m4, r4d
    pshufd      m4, m4, 0
    paddd       m4, m5                      ; horizontal term for columns 0..3
    pshufd      m6, m5, 0xFF                ; lane 3 of m5 broadcast: step to the next 4 columns

    ; columns 0-3
    paddd       m0, bottomRow0
    paddd       m5, m0, m4
    psrad       m5, 6
    packusdw    m5, m5
    movh        [r0 + 0*8], m5

    ; columns 4-7
    paddd       m4, m6
    paddd       m1, bottomRow1
    paddd       m5, m1, m4
    psrad       m5, 6
    packusdw    m5, m5
    movh        [r0 + 1*8], m5

    ; columns 8-11
    paddd       m4, m6
    paddd       m2, bottomRow2
    paddd       m5, m2, m4
    psrad       m5, 6
    packusdw    m5, m5
    movh        [r0 + 2*8], m5

    ; columns 12-15
    paddd       m4, m6
    paddd       m3, bottomRow3
    paddd       m5, m3, m4
    psrad       m5, 6
    packusdw    m5, m5
    movh        [r0 + 3*8], m5

    ; columns 16-19 (vertical term round-trips through tmp0)
    paddd       m4, m6
    mova        m7, tmp0
    paddd       m7, bottomRow4
    mova        tmp0, m7
    paddd       m7, m4
    psrad       m7, 6
    packusdw    m7, m7
    movh        [r0 + 4*8], m7

    ; columns 20-23
    paddd       m4, m6
    mova        m7, tmp1
    paddd       m7, bottomRow5
    mova        tmp1, m7
    paddd       m7, m4
    psrad       m7, 6
    packusdw    m7, m7
    movh        [r0 + 5*8], m7

    ; columns 24-27
    paddd       m4, m6
    mova        m7, tmp2
    paddd       m7, bottomRow6
    mova        tmp2, m7
    paddd       m7, m4
    psrad       m7, 6
    packusdw    m7, m7
    movh        [r0 + 6*8], m7

    ; columns 28-31
    paddd       m4, m6
    mova        m7, tmp3
    paddd       m7, bottomRow7
    mova        tmp3, m7
    paddd       m7, m4
    psrad       m7, 6
    packusdw    m7, m7
    movh        [r0 + 7*8], m7

    add         r0, r1
    inc         r6d
    cmp         r6d, 32
    jnz         .loopH

    RET
938
;-----------------------------------------------------------------------------
; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 4x4 angular prediction, modes 2 and 34: each row is the reference shifted
; by one more sample, no interpolation.
;   r0 = dst, r1 = dstStride, r2 = refLeft (replaced by refAbove when dirMode == 34)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,3,4
    cmp         r4m, byte 34
    cmove       r2, r3mp                    ; dirMode == 34: read from refAbove instead
    movu        m0, [r2 + 4]                ; reference starting at element 2
    palignr     m1, m0, 2                   ; shifted by one sample
    palignr     m2, m0, 4                   ; shifted by two samples
    add         r1, r1                      ; stride in bytes
    movh        [r0], m0                    ; row 0
    movh        [r0 + r1], m1               ; row 1
    movh        [r0 + r1 * 2], m2           ; row 2
    psrldq      m0, 6                       ; shifted by three samples
    lea         r1, [r1 * 3]
    movh        [r0 + r1], m0               ; row 3
    RET
957
; 4x4 angular prediction, modes 3 and 33.
;   r0 = dst, r1 = dstStride, r2 = refLeft (replaced by refAbove when dirMode == 33)
; The tail at .do_filter4x4 is shared by the other ang4_N entry points, which
; jump into it with:
;   m2..m5 = interleaved sample pairs for output rows 0..3
;   m0, m1, m6, m7 = ang_table weight rows for output rows 0..3
;   ZF = result of the entry point's "cmp r4m, byte <mode>" (set: store as-is,
;        clear: transpose first). No instruction between that cmp and the jz
;        may write EFLAGS.
INIT_XMM sse4
cglobal intra_pred_ang4_3, 3,4,8
    cmp         r4m, byte 33
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    palignr     m5, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m3, m1, m5                  ; [6 5 5 4 4 3 3 2]
    palignr     m1, m0, 6                   ; [x x x 8 7 6 5 4]
    punpcklwd   m4, m5, m1                  ; [7 6 6 5 5 4 4 3]
    movhlps     m0, m0                      ; [x x x x 8 7 6 5]
    punpcklwd   m5, m1, m0                  ; [8 7 7 6 6 5 5 4]

    lea         r3, [ang_table + 20 * 16]
    mova        m0, [r3 + 6 * 16]           ; [26]
    mova        m1, [r3]                    ; [20]
    mova        m6, [r3 - 6 * 16]           ; [14]
    mova        m7, [r3 - 12 * 16]          ; [ 8]
    jmp         .do_filter4x4               ; hop over the alignment padding

ALIGN 16
.do_filter4x4:
    ; weighted blend of each sample pair
    pmaddwd     m2, m0
    pmaddwd     m3, m1
    pmaddwd     m4, m6
    pmaddwd     m5, m7

    ; round: (x + 16) >> 5
    paddd       m2, [pd_16]
    paddd       m3, [pd_16]
    paddd       m4, [pd_16]
    paddd       m5, [pd_16]
    psrld       m2, 5
    psrld       m3, 5
    psrld       m4, 5
    psrld       m5, 5

    packusdw    m2, m3                      ; rows 0 and 1
    packusdw    m4, m5                      ; rows 2 and 3

    jz          .store                      ; mode matched the cmp: no transpose

    ; transpose 4x4
    punpckhwd   m0, m2, m4
    punpcklwd   m2, m4
    punpckhwd   m4, m2, m0
    punpcklwd   m2, m0

.store:
    add         r1, r1                      ; stride in bytes
    movh        [r0], m2
    movhps      [r0 + r1], m2
    movh        [r0 + r1 * 2], m4
    lea         r1, [r1 * 3]
    movhps      [r0 + r1], m4
    RET
1015
; 4x4 angular prediction, modes 4 and 32 (refAbove is used when dirMode == 32).
; Builds the sample pairs and weights, then finishes in ang4_3's shared tail,
; which consumes ZF from the cmp below.
cglobal intra_pred_ang4_4, 3,4,8
    cmp         r4m, byte 32
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    palignr     m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m3, m1, m6                  ; [6 5 5 4 4 3 3 2]
    mova        m4, m3
    palignr     m7, m0, 6                   ; [x x x 8 7 6 5 4]
    punpcklwd   m5, m6, m7                  ; [7 6 6 5 5 4 4 3]

    lea         r3, [ang_table + 18 * 16]
    mova        m0, [r3 + 3 * 16]           ; [21]
    mova        m1, [r3 - 8 * 16]           ; [10]
    mova        m6, [r3 + 13 * 16]          ; [31]
    mova        m7, [r3 + 2 * 16]           ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1034
; 4x4 angular prediction, modes 5 and 31 (refAbove is used when dirMode == 31).
; Builds the sample pairs and weights, then finishes in ang4_3's shared tail,
; which consumes ZF from the cmp below.
cglobal intra_pred_ang4_5, 3,4,8
    cmp         r4m, byte 31
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    palignr     m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m3, m1, m6                  ; [6 5 5 4 4 3 3 2]
    mova        m4, m3
    palignr     m7, m0, 6                   ; [x x x 8 7 6 5 4]
    punpcklwd   m5, m6, m7                  ; [7 6 6 5 5 4 4 3]

    lea         r3, [ang_table + 10 * 16]
    mova        m0, [r3 + 7 * 16]           ; [17]
    mova        m1, [r3 - 8 * 16]           ; [ 2]
    mova        m6, [r3 + 9 * 16]           ; [19]
    mova        m7, [r3 - 6 * 16]           ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1053
; 4x4 angular prediction, modes 6 and 30 (refAbove is used when dirMode == 30).
; Builds the sample pairs and weights, then finishes in ang4_3's shared tail,
; which consumes ZF from the cmp below.
cglobal intra_pred_ang4_6, 3,4,8
    cmp         r4m, byte 30
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    mova        m3, m2
    palignr     m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m4, m1, m6                  ; [6 5 5 4 4 3 3 2]
    mova        m5, m4

    lea         r3, [ang_table + 19 * 16]
    mova        m0, [r3 - 6 * 16]           ; [13]
    mova        m1, [r3 + 7 * 16]           ; [26]
    mova        m6, [r3 - 12 * 16]          ; [ 7]
    mova        m7, [r3 + 1 * 16]           ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1071
; 4x4 angular prediction, modes 7 and 29 (refAbove is used when dirMode == 29).
; Builds the sample pairs and weights, then finishes in ang4_3's shared tail,
; which consumes ZF from the cmp below.
cglobal intra_pred_ang4_7, 3,4,8
    cmp         r4m, byte 29
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    mova        m3, m2
    mova        m4, m2
    palignr     m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m5, m1, m6                  ; [6 5 5 4 4 3 3 2]

    lea         r3, [ang_table + 20 * 16]
    mova        m0, [r3 - 11 * 16]          ; [ 9]
    mova        m1, [r3 - 2 * 16]           ; [18]
    mova        m6, [r3 + 7 * 16]           ; [27]
    mova        m7, [r3 - 16 * 16]          ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1089
; 4x4 angular prediction, modes 8 and 28 (refAbove is used when dirMode == 28).
; All four rows blend the same sample pairs with different weights; finishes
; in ang4_3's shared tail, which consumes ZF from the cmp below.
cglobal intra_pred_ang4_8, 3,4,8
    cmp         r4m, byte 28
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    mova        m3, m2
    mova        m4, m2
    mova        m5, m2

    lea         r3, [ang_table + 13 * 16]
    mova        m0, [r3 - 8 * 16]           ; [ 5]
    mova        m1, [r3 - 3 * 16]           ; [10]
    mova        m6, [r3 + 2 * 16]           ; [15]
    mova        m7, [r3 + 7 * 16]           ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1106
1107
; 4x4 angular prediction, modes 9 and 27 (refAbove is used when dirMode == 27).
; All four rows blend the same sample pairs with different weights; finishes
; in ang4_3's shared tail, which consumes ZF from the cmp below.
cglobal intra_pred_ang4_9, 3,4,8
    cmp         r4m, byte 27
    cmove       r2, r3mp
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    mova        m3, m2
    mova        m4, m2
    mova        m5, m2

    lea         r3, [ang_table + 4 * 16]
    mova        m0, [r3 - 2 * 16]           ; [ 2]
    mova        m1, [r3 - 0 * 16]           ; [ 4]
    mova        m6, [r3 + 2 * 16]           ; [ 6]
    mova        m7, [r3 + 4 * 16]           ; [ 8]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1124
; 4x4 angular prediction, mode 10: every output row is one left-reference
; sample replicated across the row.
;   r0 = dst, r1 = dstStride, r2 = refLeft; refAbove is fetched via r3mp only
;   when bFilter (r5m) is non-zero.
; With bFilter set, row 0 becomes
;   clip(left[1] + ((above[1 + x] - above[0]) >> 1)) with the clip range 0..pw_1023.
; Lane annotations assume pb_unpackwq1 / pb_unpackwq2 replicate words 0,1 and
; 2,3 four times each, as the original comments state.
cglobal intra_pred_ang4_10, 3,3,4
    movh        m0, [r2 + 2]                ; [4 3 2 1]
    pshufb      m2, m0, [pb_unpackwq2]      ; [4 4 4 4 3 3 3 3]
    pshufb      m0, [pb_unpackwq1]          ; [2 2 2 2 1 1 1 1]
    movhlps     m1, m0                      ; [2 2 2 2]
    movhlps     m3, m2                      ; [4 4 4 4]
    add         r1, r1                      ; stride in bytes
    movh        [r0 + r1], m1               ; row 1
    movh        [r0 + r1 * 2], m2           ; row 2
    lea         r1, [r1 * 3]
    movh        [r0 + r1], m3               ; row 3

    cmp         r5m, byte 0
    jz          .quit                       ; no filter: row 0 is stored unmodified

    ; edge filter for row 0
    mov         r2, r3mp                    ; r2 = refAbove
    movu        m1, [r2]                    ; [7 6 5 4 3 2 1 0]
    pshufb      m2, m1, [pb_unpackwq1]      ; [0 0 0 0] in the low half
    palignr     m1, m1, 2                   ; [4 3 2 1] in the low half
    psubw       m1, m2                      ; above[1 + x] - above[0]
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                      ; clamp below at 0
    pminsw      m0, [pw_1023]               ; clamp above at pw_1023

.quit:
    movh        [r0], m0                    ; row 0
    RET
1155
; 4x4 angular prediction, mode 26: every output row is a copy of the first
; four above-reference samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (reused as 3 * stride)
; With bFilter (r5m) set, column 0 becomes
;   clip(above[1] + ((left[1 + y] - left[0]) >> 1)) with the clip range 0..pw_1023.
; Lane annotations assume pb_unpackwq1 replicates words 0 and 1 four times
; each, as the original comments state.
cglobal intra_pred_ang4_26, 4,4,3
    movh        m0, [r3 + 2]                ; above[1..4]
    add         r1, r1                      ; stride in bytes
    lea         r3, [r1 * 3]                ; r3 = 3 * stride

    ; unfiltered block
    movh        [r0], m0
    movh        [r0 + r1], m0
    movh        [r0 + r1 * 2], m0
    movh        [r0 + r3], m0

    cmp         r5m, byte 0
    jz          .quit

    ; edge filter for column 0
    pshufb      m0, [pb_unpackwq1]          ; [2 2 2 2 1 1 1 1]
    movu        m1, [r2]                    ; [7 6 5 4 3 2 1 0]
    pshufb      m2, m1, [pb_unpackwq1]      ; [0 0 0 0] in the low half
    palignr     m1, m1, 2                   ; [4 3 2 1] in the low half
    psubw       m1, m2                      ; left[1 + y] - left[0]
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                      ; clamp below at 0
    pminsw      m0, [pw_1023]               ; clamp above at pw_1023

    pextrw      [r0], m0, 0
    pextrw      [r0 + r1], m0, 1
    pextrw      [r0 + r1 * 2], m0, 2
    pextrw      [r0 + r3], m0, 3

.quit:
    RET
1188
; 4x4 angular prediction, modes 11 and 25 (refAbove is used when dirMode == 25).
; All four rows blend the same sample pairs (starting at reference element 0)
; with different weights; finishes in ang4_3's shared tail, which consumes ZF
; from the cmp below.
cglobal intra_pred_ang4_11, 3,4,8
    cmp         r4m, byte 25
    cmove       r2, r3mp
    movu        m2, [r2]                    ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2                   ; [x x x x 4 3 2 1]
    punpcklwd   m2, m1                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2
    mova        m4, m2
    mova        m5, m2

    lea         r3, [ang_table + 24 * 16]
    mova        m0, [r3 + 6 * 16]           ; [30]
    mova        m1, [r3 + 4 * 16]           ; [28]
    mova        m6, [r3 + 2 * 16]           ; [26]
    mova        m7, [r3 + 0 * 16]           ; [24]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1205
1206
; 4x4 angular prediction, modes 12 and 24 (refAbove is used when dirMode == 24).
; All four rows blend the same sample pairs (starting at reference element 0)
; with different weights; finishes in ang4_3's shared tail, which consumes ZF
; from the cmp below.
cglobal intra_pred_ang4_12, 3,4,8
    cmp         r4m, byte 24
    cmove       r2, r3mp
    movu        m2, [r2]                    ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2                   ; [x x x x 4 3 2 1]
    punpcklwd   m2, m1                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2
    mova        m4, m2
    mova        m5, m2

    lea         r3, [ang_table + 20 * 16]
    mova        m0, [r3 + 7 * 16]           ; [27]
    mova        m1, [r3 + 2 * 16]           ; [22]
    mova        m6, [r3 - 3 * 16]           ; [17]
    mova        m7, [r3 - 8 * 16]           ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1223
1224
; 4x4 angular prediction, modes 13 and 23.
;   r2 = main reference, r3 = side reference; the two are swapped when
;   dirMode == 23. One side-reference sample (element 4) is spliced in front
;   of the main reference for the last row.
; ZF from the cmp must reach ang4_3's shared tail untouched: jnz, xchg, lea
; and the SSE instructions below do not write EFLAGS.
cglobal intra_pred_ang4_13, 4,4,8
    cmp         r4m, byte 23
    jnz         .load
    xchg        r2, r3
.load:
    movu        m5, [r2 - 2]                ; [x x 4 3 2 1 0 x]
    palignr     m2, m5, 2                   ; [x x x 4 3 2 1 0]
    palignr     m0, m5, 4                   ; [x x x x 4 3 2 1]
    pinsrw      m5, [r3 + 8], 0             ; lane 0 <- side reference element 4
    punpcklwd   m5, m2                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m2, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2
    mova        m4, m2

    lea         r3, [ang_table + 21 * 16]
    mova        m0, [r3 + 2 * 16]           ; [23]
    mova        m1, [r3 - 7 * 16]           ; [14]
    mova        m6, [r3 - 16 * 16]          ; [ 5]
    mova        m7, [r3 + 7 * 16]           ; [28]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1245
; 4x4 angular prediction, modes 14 and 22.
;   r2 = main reference, r3 = side reference; the two are swapped when
;   dirMode == 22. One side-reference sample (element 2) is spliced in front
;   of the main reference for the last two rows.
; ZF from the cmp must reach ang4_3's shared tail untouched: jnz, xchg, lea
; and the SSE instructions below do not write EFLAGS.
cglobal intra_pred_ang4_14, 4,4,8
    cmp         r4m, byte 22
    jnz         .load
    xchg        r2, r3
.load:
    movu        m5, [r2 - 2]                ; [x x 4 3 2 1 0 x]
    palignr     m2, m5, 2                   ; [x x x 4 3 2 1 0]
    palignr     m0, m5, 4                   ; [x x x x 4 3 2 1]
    pinsrw      m5, [r3 + 4], 0             ; lane 0 <- side reference element 2
    punpcklwd   m5, m2                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m2, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2
    mova        m4, m5

    lea         r3, [ang_table + 19 * 16]
    mova        m0, [r3 + 0 * 16]           ; [19]
    mova        m1, [r3 - 13 * 16]          ; [ 6]
    mova        m6, [r3 + 6 * 16]           ; [25]
    mova        m7, [r3 - 7 * 16]           ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1266
1267
;-----------------------------------------------------------------------------
; intra_pred_ang4_15 -- register set-up for 4x4 angular prediction, mode 15.
; Also serves mode 21: when arg 4 (r4m) equals 21 the two reference pointers
; r2 and r3 are exchanged first.
;
; The main reference is read from [r2 - 2] and extended downwards by two
; samples taken from the other reference: [r3 + 4] (x) and [r3 + 8] (y).
; Out: m2 = pairs starting at sample 0, m3 = m4 = pairs starting at x,
;      m5 = pairs starting at y; m0, m1, m6, m7 = weights 15, 30, 13, 28.
; Control then continues in .do_filter4x4 of intra_pred_ang4_3 (outside this
; view; presumably it pairs m2/m3/m4/m5 with m0/m1/m6/m7 -- confirm there).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_15, 4,4,8
    cmp         r4m, byte 21
    jnz        .load
    xchg        r2, r3                          ; mode 21: swap the reference roles
.load:
    movu        m3, [r2 - 2]                    ; [x x 4 3 2 1 0 x]
    palignr     m2, m3, 2                       ; [x x x 4 3 2 1 0]
    palignr     m0, m3, 4                       ; [x x x x 4 3 2 1]
    pinsrw      m3, [r3 + 4], 0                 ; first projected sample (x)
    pslldq      m5, m3, 2                       ; [x 4 3 2 1 0 x y]
    pinsrw      m5, [r3 + 8], 0                 ; second projected sample (y)
    punpcklwd   m5, m3                          ; [2 1 1 0 0 x x y]
    punpcklwd   m3, m2                          ; [3 2 2 1 1 0 0 x]
    punpcklwd   m2, m0                          ; [4 3 3 2 2 1 1 0]
    mova        m4, m3

    lea         r3, [ang_table + 23 * 16]       ; r3 is free once the projected samples are read
    mova        m0, [r3 - 8 * 16]               ; [15]
    mova        m1, [r3 + 7 * 16]               ; [30]
    mova        m6, [r3 - 10 * 16]              ; [13]
    mova        m7, [r3 + 5 * 16]               ; [28]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1290
1291
;-----------------------------------------------------------------------------
; intra_pred_ang4_16 -- register set-up for 4x4 angular prediction, mode 16.
; Also serves mode 20: when arg 4 (r4m) equals 20 the two reference pointers
; r2 and r3 are exchanged first.
;
; The main reference is read from [r2 - 2] and extended downwards by two
; samples taken from the other reference: [r3 + 4] (x) and [r3 + 6] (y).
; Out: m2 = pairs starting at sample 0, m3 = m4 = pairs starting at x,
;      m5 = pairs starting at y; m0, m1, m6, m7 = weights 11, 22, 1, 12.
; Control then continues in .do_filter4x4 of intra_pred_ang4_3 (outside this
; view; presumably it pairs m2/m3/m4/m5 with m0/m1/m6/m7 -- confirm there).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_16, 4,4,8
    cmp         r4m, byte 20
    jnz        .load
    xchg        r2, r3                          ; mode 20: swap the reference roles
.load:
    movu        m3, [r2 - 2]                    ; [x x 4 3 2 1 0 x]
    palignr     m2, m3, 2                       ; [x x x 4 3 2 1 0]
    palignr     m0, m3, 4                       ; [x x x x 4 3 2 1]
    pinsrw      m3, [r3 + 4], 0                 ; first projected sample (x)
    pslldq      m5, m3, 2                       ; [x 4 3 2 1 0 x y]
    pinsrw      m5, [r3 + 6], 0                 ; second projected sample (y)
    punpcklwd   m5, m3                          ; [2 1 1 0 0 x x y]
    punpcklwd   m3, m2                          ; [3 2 2 1 1 0 0 x]
    punpcklwd   m2, m0                          ; [4 3 3 2 2 1 1 0]
    mova        m4, m3

    lea         r3, [ang_table + 19 * 16]       ; r3 is free once the projected samples are read
    mova        m0, [r3 - 8 * 16]               ; [11]
    mova        m1, [r3 + 3 * 16]               ; [22]
    mova        m6, [r3 - 18 * 16]              ; [ 1]
    mova        m7, [r3 - 7 * 16]               ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1314
;-----------------------------------------------------------------------------
; intra_pred_ang4_17 -- register set-up for 4x4 angular prediction, mode 17.
; Also serves mode 19: when arg 4 (r4m) equals 19 the two reference pointers
; r2 and r3 are exchanged first.
;
; The main reference is read from [r2 - 2] and extended downwards by three
; samples taken from the other reference: [r3 + 2] (x), [r3 + 4] (y) and
; [r3 + 8] (z).
; Out: m2, m3, m4, m5 = sample pairs starting at 0, x, y, z respectively;
;      m0, m1, m6, m7 = ang_table weights 6, 12, 18, 24.
; m6 and m1 are scratch for the reference data until the weights are loaded.
; Control then continues in .do_filter4x4 of intra_pred_ang4_3 (outside this
; view; presumably it pairs m2/m3/m4/m5 with m0/m1/m6/m7 -- confirm there).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_17, 4,4,8
    cmp         r4m, byte 19
    jnz        .load
    xchg        r2, r3                          ; mode 19: swap the reference roles
.load:
    movu        m6, [r2 - 2]                    ; [- - 4 3 2 1 0 x]
    palignr     m2, m6, 2                       ; [- - - 4 3 2 1 0]
    palignr     m1, m6, 4                       ; [- - - - 4 3 2 1]
    mova        m4, m2
    punpcklwd   m2, m1                          ; [4 3 3 2 2 1 1 0]

    pinsrw      m6, [r3 + 2], 0                 ; first projected sample (x)
    punpcklwd   m3, m6, m4                      ; [3 2 2 1 1 0 0 x]

    pslldq      m4, m6, 2                       ; [- 4 3 2 1 0 x y]
    pinsrw      m4, [r3 + 4], 0                 ; second projected sample (y)
    pslldq      m5, m4, 2                       ; [4 3 2 1 0 x y z]
    pinsrw      m5, [r3 + 8], 0                 ; third projected sample (z)
    punpcklwd   m5, m4                          ; [1 0 0 x x y y z]
    punpcklwd   m4, m6                          ; [2 1 1 0 0 x x y]

    lea         r3, [ang_table + 14 * 16]       ; r3 is free once the projected samples are read
    mova        m0, [r3 - 8 * 16]               ; [ 6]
    mova        m1, [r3 - 2 * 16]               ; [12]
    mova        m6, [r3 + 4 * 16]               ; [18]
    mova        m7, [r3 + 10 * 16]              ; [24]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1342
1343
;-----------------------------------------------------------------------------
; intra_pred_ang4_18 -- 4x4 angular prediction, mode 18 (no interpolation).
; In:  r0 = dst, r1 = stride in samples (doubled to bytes below),
;      r2, r3 = the two reference pointers.
; Builds one 8-sample vector: the low half is the first four words at [r2]
; shuffled by pw_swap (defined elsewhere; presumably a word-order reversal --
; confirm), the high half is the four words at [r3 + 2]. Each row is a
; 4-sample window of that vector: the bottom row takes the lowest window and
; every row above it takes a window one sample higher.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_18, 4,4,1
    movh        m0, [r2]
    pshufb      m0, [pw_swap]
    movhps      m0, [r3 + 2]
    add         r1, r1                          ; stride: samples -> bytes
    lea         r2, [r1 * 3]
    movh        [r0 + r2], m0                   ; row 3
    psrldq      m0, 2
    movh        [r0 + r1 * 2], m0               ; row 2
    psrldq      m0, 2
    movh        [r0 + r1], m0                   ; row 1
    psrldq      m0, 2
    movh        [r0], m0                        ; row 0
    RET
1358
;-----------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; intra_pred_ang8_2 -- 8x8 angular prediction, mode 2 (and mode 34).
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft; when dirMode (r4m) equals 34 the refAbove pointer
;      (arg 3) is used instead.
; No interpolation: row k is the eight reference samples starting at
; sample 2 + k, i.e. the 16 samples loaded from [r2 + 4] shifted by one
; word per row.
; Scratch: r3 = 3 * stride, m2.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang8_2, 3,4,3
    cmp         r4m, byte 34
    cmove       r2, r3mp                        ; mode 34: read from arg 3 instead
    add         r1, r1                          ; stride: samples -> bytes
    lea         r3, [r1 * 3]
    movu        m0, [r2 + 4]                    ; samples 2..9
    movu        m1, [r2 + 20]                   ; samples 10..17
    movu        [r0], m0                        ; row 0
    palignr     m2, m1, m0, 2
    movu        [r0 + r1], m2                   ; row 1
    palignr     m2, m1, m0, 4
    movu        [r0 + r1 * 2], m2               ; row 2
    palignr     m2, m1, m0, 6
    movu        [r0 + r3], m2                   ; row 3
    lea         r0, [r0 + r1 * 4]
    palignr     m2, m1, m0, 8
    movu        [r0], m2                        ; row 4
    palignr     m2, m1, m0, 10
    movu        [r0 + r1], m2                   ; row 5
    palignr     m2, m1, m0, 12
    movu        [r0 + r1 * 2], m2               ; row 6
    palignr     m1, m0, 14
    movu        [r0 + r3], m1                   ; row 7
    RET
1387
;-----------------------------------------------------------------------------
; intra_pred_ang8_3 -- 8x8 angular prediction, mode 3, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where (a, b) are neighbouring reference samples and f is the ang_table
; entry noted in brackets on the pmaddwd line. Four column vectors are
; transposed and stored as 4-sample row pieces: columns 0-3 first, then
; columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[14], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang8_3, 3,5,8
    lea         r3, [ang_table + 14 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2                   ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                       ; [x 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                          ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                      ; [13 12 12 11 11 10 10 9]
    punpckhwd   m1, m4                          ; [x 16 16 15 15 14 14 13]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 + 12 * 16]              ; [26]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 + 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    palignr     m2, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    pmaddwd     m2, [r3 + 6 * 16]               ; [20]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m6, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    pmaddwd     m6, [r3 + 6 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    palignr     m6, m0, m3, 8                   ; [7 6 6 5 5 4 4 3]
    pmaddwd     m6, [r3]                        ; [14]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m7, m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    pmaddwd     m7, [r3]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    palignr     m7, m0, m3, 12                  ; [8 7 7 6 6 5 5 4]
    pmaddwd     m7, [r3 - 6 * 16]               ; [ 8]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m3, m5, m0, 12                  ; [12 11 11 10 10 9 9 8]
    pmaddwd     m3, [r3 - 6 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m7, m3

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    lea         r4, [r1 * 3]
    movh        [r0], m7
    movhps      [r0 + r1], m7
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m3
    movhps      [r2 + r4], m3

    ; ---- columns 4-7 ----
    mova        m4, m0
    pmaddwd     m4, [r3 - 12 * 16]              ; [ 2]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m5
    pmaddwd     m2, [r3 - 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m0
    pmaddwd     m2, [r3 + 14 * 16]              ; [28]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m6, m5
    pmaddwd     m6, [r3 + 14 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    palignr     m6, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    pmaddwd     m6, [r3 + 8 * 16]               ; [22]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m7, m1, m5, 4                   ; [14 13 13 12 12 11 11 10]
    pmaddwd     m7, [r3 + 8 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    palignr     m7, m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    pmaddwd     m7, [r3 + 2 * 16]               ; [16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, 8                       ; [15 14 14 13 13 12 12 11]
    pmaddwd     m1, [r3 + 2 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    movh        [r0 + 8], m7
    movhps      [r0 + r1 + 8], m7
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m3
    movhps      [r0 + r4 + 8], m3

    RET
1525
;-----------------------------------------------------------------------------
; intra_pred_ang8_4 -- 8x8 angular prediction, mode 4, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where (a, b) are neighbouring reference samples and f is the ang_table
; entry noted in brackets on the pmaddwd line. Four column vectors are
; transposed and stored as 4-sample row pieces: columns 0-3 first, then
; columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[19], r4 = 3 * stride, r5 = row-4 pointer
; (r2 must stay intact: the reference is read again for column 7).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_4, 3,6,8
    lea         r3, [ang_table + 19 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2                   ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                       ; [x 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                          ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                      ; [13 12 12 11 11 10 10 9]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 + 2 * 16]               ; [21]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    palignr     m2, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    mova        m6, m2                          ; same pairs reused with weight 31
    pmaddwd     m2, [r3 - 9 * 16]               ; [10]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    mova        m7, m1
    pmaddwd     m1, [r3 - 9 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    pmaddwd     m6, [r3 + 12 * 16]              ; [31]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 12 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    palignr     m7, m0, m3, 8                   ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 + 1 * 16]               ; [20]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    pmaddwd     m1, [r3 + 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m7
    movhps      [r0 + r1], m7
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r5, [r0 + r1 * 4]
    movh        [r5], m6
    movhps      [r5 + r1], m6
    movh        [r5 + r1 * 2], m1
    movhps      [r5 + r4], m1

    ; ---- columns 4-7 ----
    palignr     m4, m0, m3, 12                  ; [8 7 7 6 6 5 5 4]
    mova        m2, m4                          ; same pairs reused with weight 30
    pmaddwd     m4, [r3 - 10 * 16]              ; [ 9]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m3, m5, m0, 12                  ; [12 11 11 10 10 9 9 8]
    mova        m6, m3
    pmaddwd     m3, [r3 - 10 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m4, m3

    pmaddwd     m2, [r3 + 11 * 16]              ; [30]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m6, [r3 + 11 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    mova        m6, m0
    pmaddwd     m6, [r3]                        ; [19]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m7, m5
    pmaddwd     m7, [r3]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    movh        m1, [r2 + 26]                   ; [16 15 14 13]
    palignr     m7, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    pmaddwd     m7, [r3 - 11 * 16]              ; [8]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, 4                       ; [14 13 13 12 12 11 11 10]
    pmaddwd     m1, [r3 - 11 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    movh        [r0 + 8], m7
    movhps      [r0 + r1 + 8], m7
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m3
    movhps      [r0 + r4 + 8], m3

    RET
1662
;-----------------------------------------------------------------------------
; intra_pred_ang8_5 -- 8x8 angular prediction, mode 5, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where (a, b) are neighbouring reference samples and f is the ang_table
; entry noted in brackets on the pmaddwd line. Four column vectors are
; transposed and stored as 4-sample row pieces: columns 0-3 first, then
; columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[13], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_5, 3,5,8
    lea         r3, [ang_table + 13 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2                   ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                       ; [x 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                          ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                      ; [13 12 12 11 11 10 10 9]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 + 4 * 16]               ; [17]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 + 4 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    palignr     m2, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    mova        m6, m2                          ; same pairs reused with weight 19
    pmaddwd     m2, [r3 - 11 * 16]              ; [2]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    mova        m7, m1
    pmaddwd     m1, [r3 - 11 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    pmaddwd     m6, [r3 + 6 * 16]               ; [19]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 6 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    palignr     m7, m0, m3, 8                   ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 - 9 * 16]               ; [4]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    pmaddwd     m1, [r3 - 9 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m7
    movhps      [r0 + r1], m7
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m1
    movhps      [r2 + r4], m1

    ; ---- columns 4-7 ----
    palignr     m4, m0, m3, 8                   ; [7 6 6 5 5 4 4 3]
    pmaddwd     m4, [r3 + 8 * 16]               ; [21]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    pmaddwd     m2, [r3 + 8 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    palignr     m2, m0, m3, 12                  ; [8 7 7 6 6 5 5 4]
    mova        m6, m2                          ; same pairs reused with weight 23
    pmaddwd     m2, [r3 - 7 * 16]               ; [6]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 12                  ; [12 11 11 10 10 9 9 8]
    mova        m7, m1
    pmaddwd     m1, [r3 - 7 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    pmaddwd     m6, [r3 + 10 * 16]              ; [23]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 10 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    mova        m7, m0
    pmaddwd     m7, [r3 - 5 * 16]               ; [8]
    paddd       m7, [pd_16]
    psrld       m7, 5
    mova        m1, m5
    pmaddwd     m1, [r3 - 5 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    movh        [r0 + 8], m7
    movhps      [r0 + r1 + 8], m7
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m3
    movhps      [r0 + r4 + 8], m3

    RET
1798
;-----------------------------------------------------------------------------
; intra_pred_ang8_6 -- 8x8 angular prediction, mode 6, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where (a, b) are neighbouring reference samples and f is the ang_table
; entry noted in brackets on the pmaddwd line. Four column vectors are
; transposed and stored as 4-sample row pieces: columns 0-3 first, then
; columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[14], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_6, 3,5,8
    lea         r3, [ang_table + 14 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2                   ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                       ; [x 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                          ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                      ; [13 12 12 11 11 10 10 9]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 - 1 * 16]               ; [13]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 - 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 + 12 * 16]              ; [26]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 12 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    palignr     m6, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    mova        m7, m6                          ; same pairs reused with weight 20
    pmaddwd     m6, [r3 - 7 * 16]               ; [7]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m1, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    pmaddwd     m1, [r3 - 7 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m7, [r3 + 6 * 16]               ; [20]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    pmaddwd     m1, [r3 + 6 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m7
    movhps      [r0 + r1], m7
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m1
    movhps      [r2 + r4], m1

    ; ---- columns 4-7 ----
    palignr     m4, m0, m3, 8                   ; [7 6 6 5 5 4 4 3]
    mova        m6, m4                          ; same pairs reused with weights 14 and 27
    pmaddwd     m4, [r3 - 13 * 16]              ; [1]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    mova        m7, m2
    pmaddwd     m2, [r3 - 13 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    pmaddwd     m2, m6, [r3]                    ; [14]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m7, [r3]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    pmaddwd     m6, [r3 + 13 * 16]              ; [27]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 13 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    palignr     m7, m0, m3, 12                  ; [8 7 7 6 6 5 5 4]
    pmaddwd     m7, [r3 - 6 * 16]               ; [8]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m5, m0, 12                      ; [12 11 11 10 10 9 9 8]
    pmaddwd     m5, [r3 - 6 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m7, m5

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    movh        [r0 + 8], m7
    movhps      [r0 + r1 + 8], m7
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m3
    movhps      [r0 + r4 + 8], m3

    RET
1932
;-----------------------------------------------------------------------------
; intra_pred_ang8_7 -- 8x8 angular prediction, mode 7, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where (a, b) are neighbouring reference samples and f is the ang_table
; entry noted in brackets on the pmaddwd line. Four column vectors are
; transposed and stored as 4-sample row pieces: columns 0-3 first, then
; columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[18], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_7, 3,5,8
    lea         r3, [ang_table + 18 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2                   ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                       ; [x 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                          ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                      ; [13 12 12 11 11 10 10 9]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 - 9 * 16]               ; [9]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 - 9 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3]                        ; [18]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r3 + 9 * 16]               ; [27]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 9 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m7, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    pmaddwd     m7, [r3 - 14 * 16]              ; [4]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    pmaddwd     m1, [r3 - 14 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m7
    movhps      [r0 + r1], m7
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m1
    movhps      [r2 + r4], m1

    ; ---- columns 4-7 ----
    palignr     m4, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    mova        m6, m4                          ; same pairs reused with weights 22 and 31
    pmaddwd     m4, [r3 - 5 * 16]               ; [13]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m5, m0, 4                   ; [10 9 9 8 8 7 7 6]
    mova        m7, m2
    pmaddwd     m2, [r3 - 5 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    pmaddwd     m2, m6, [r3 + 4 * 16]           ; [22]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m7, [r3 + 4 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    pmaddwd     m6, [r3 + 13 * 16]              ; [31]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 13 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    palignr     m7, m0, m3, 8                   ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 - 10 * 16]              ; [8]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m5, m0, 8                       ; [11 10 10 9 9 8 8 7]
    pmaddwd     m5, [r3 - 10 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m7, m5

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m7
    punpcklwd   m6, m7

    punpckldq   m7, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    movh        [r0 + 8], m7
    movhps      [r0 + r1 + 8], m7
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m3
    movhps      [r0 + r4 + 8], m3

    RET
2066
;-----------------------------------------------------------------------------
; intra_pred_ang8_8 -- 8x8 angular prediction, mode 8, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where (a, b) are neighbouring reference samples and f is the ang_table
; entry noted in brackets on the pmaddwd line. Four column vectors are
; transposed and stored as 4-sample row pieces: columns 0-3 first, then
; columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[17], r4 = 3 * stride, r5 = row-4 pointer
; (r2 must stay intact: the reference is read again for columns 6-7).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_8, 3,6,7
    lea         r3, [ang_table + 17 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 4]                    ; [9 8 7 6 5 4 3 2]

    punpcklwd   m3, m0, m1                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m1                          ; [9 8 8 7 7 6 6 5]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 - 12 * 16]              ; [5]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 - 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 - 7 * 16]               ; [10]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 - 7 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r3 - 2 * 16]               ; [15]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m1, m0
    pmaddwd     m1, [r3 - 2 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    mova        m5, m3
    pmaddwd     m5, [r3 + 3 * 16]               ; [20]
    paddd       m5, [pd_16]
    psrld       m5, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m5
    punpcklwd   m6, m5

    punpckldq   m5, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m5
    movhps      [r0 + r1], m5
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r5, [r0 + r1 * 4]
    movh        [r5], m6
    movhps      [r5 + r1], m6
    movh        [r5 + r1 * 2], m1
    movhps      [r5 + r4], m1

    ; ---- columns 4-7 ----
    mova        m4, m3
    pmaddwd     m4, [r3 + 8 * 16]               ; [25]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 + 8 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 + 13 * 16]              ; [30]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 13 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    movh        m1, [r2 + 18]                   ; [12 11 10 9]

    palignr     m6, m0, m3, 4                   ; [6 5 5 4 4 3 3 2]
    mova        m5, m6                          ; same pairs reused with weight 8
    pmaddwd     m6, [r3 - 14 * 16]              ; [3]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m1, m0, 4                       ; [10 9 9 8 8 7 7 6]
    mova        m3, m1
    pmaddwd     m1, [r3 - 14 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m5, [r3 - 9 * 16]               ; [8]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m3, [r3 - 9 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m5, m3

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m3, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m5
    punpcklwd   m6, m5

    punpckldq   m5, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m3, m2
    punpckhdq   m3, m2

    movh        [r0 + 8], m5
    movhps      [r0 + r1 + 8], m5
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m3
    movhps      [r0 + r4 + 8], m3

    RET
2201
;-----------------------------------------------------------------------------
; intra_pred_ang8_9 -- 8x8 angular prediction, mode 9, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; All eight columns use the same sample pairs (m3 / m0) and differ only in
; the weight f (2, 4, ..., 16):
;     (a * (32 - f) + b * f + pd_16) >> 5
; Four column vectors are transposed and stored as 4-sample row pieces:
; columns 0-3 first, then columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[9], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_9, 3,5,7
    lea         r3, [ang_table + 9 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 4]                    ; [9 8 7 6 5 4 3 2]

    punpcklwd   m3, m0, m1                      ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m1                          ; [9 8 8 7 7 6 6 5]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 - 7 * 16]               ; [2]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 - 7 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 - 5 * 16]               ; [4]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 - 5 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r3 - 3 * 16]               ; [6]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m1, m0
    pmaddwd     m1, [r3 - 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    mova        m5, m3
    pmaddwd     m5, [r3 - 1 * 16]               ; [8]
    paddd       m5, [pd_16]
    psrld       m5, 5
    mova        m1, m0
    pmaddwd     m1, [r3 - 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m5
    punpcklwd   m6, m5

    punpckldq   m5, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m5
    movhps      [r0 + r1], m5
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m1
    movhps      [r2 + r4], m1

    ; ---- columns 4-7 ----
    mova        m4, m3
    pmaddwd     m4, [r3 + 1 * 16]               ; [10]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 + 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 + 3 * 16]               ; [12]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r3 + 5 * 16]               ; [14]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m5, m0
    pmaddwd     m5, [r3 + 5 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m6, m5

    ; last column: the sample pairs are consumed in place
    pmaddwd     m3, [r3 + 7 * 16]               ; [16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    pmaddwd     m0, [r3 + 7 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m3, m0

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m5, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m3
    punpcklwd   m6, m3

    punpckldq   m3, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m5, m2
    punpckhdq   m5, m2

    movh        [r0 + 8], m3
    movhps      [r0 + r1 + 8], m3
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m5
    movhps      [r0 + r4 + 8], m5

    RET
2332
;-----------------------------------------------------------------------------
; intra_pred_ang8_10 -- 8x8 angular prediction, mode 10, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft, r3 = refAbove, r5m = bFilter
;      (names per the intraPredAng8 prototype comment).
; Row k is reference sample k + 1 from r2 replicated across the row
; (pw_unpackwdq is defined elsewhere; per the layout comments below it
; replicates the low word of its source).
; Row 0 is stored last. When bFilter is non-zero it is first adjusted to
;     clamp(row0 + ((r3[1..8] - r3[0]) >> 1), 0, pw_1023)
; Scratch: r4 = 3 * stride, r2 reused as row-4 pointer, m1, m2.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_10, 4,5,3
    movu        m1, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    pshufb      m0, m1, [pw_unpackwdq]          ; [1 1 1 1 1 1 1 1]
    add         r1, r1                          ; stride: samples -> bytes
    lea         r4, [r1 * 3]

    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [2 2 2 2 2 2 2 2]
    movu        [r0 + r1], m2
    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [3 3 3 3 3 3 3 3]
    movu        [r0 + r1 * 2], m2
    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [4 4 4 4 4 4 4 4]
    movu        [r0 + r4], m2

    lea         r2, [r0 + r1 *4]
    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [5 5 5 5 5 5 5 5]
    movu        [r2], m2
    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [6 6 6 6 6 6 6 6]
    movu        [r2 + r1], m2
    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [7 7 7 7 7 7 7 7]
    movu        [r2 + r1 * 2], m2
    psrldq      m1, 2
    pshufb      m2, m1, [pw_unpackwdq]          ; [8 8 8 8 8 8 8 8]
    movu        [r2 + r4], m2

    cmp         r5m, byte 0
    jz         .quit                            ; bFilter == 0: row 0 is stored unmodified

    ; filter

    movh        m1, [r3]                        ; [3 2 1 0]
    pshufb      m2, m1, [pw_unpackwdq]          ; [0 0 0 0 0 0 0 0]
    movu        m1, [r3 + 2]                    ; [8 7 6 5 4 3 2 1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                          ; clamp below at 0
    pminsw      m0, [pw_1023]                   ; clamp above at pw_1023

.quit:
    movu        [r0], m0                        ; row 0
    RET
2381
;-----------------------------------------------------------------------------
; intra_pred_ang8_11 -- 8x8 angular prediction, mode 11, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft (names per the intraPredAng8 prototype comment).
; All eight columns use the same sample pairs (m3 / m0, starting at sample 0)
; and differ only in the weight f (30, 28, ..., 16):
;     (a * (32 - f) + b * f + pd_16) >> 5
; Four column vectors are transposed and stored as 4-sample row pieces:
; columns 0-3 first, then columns 4-7 at byte offset 8.
; Scratch: r3 = &ang_table[23], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_11, 3,5,7
    lea         r3, [ang_table + 23 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    punpcklwd   m3, m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0, m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r3 + 7 * 16]               ; [30]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 + 7 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 + 5 * 16]               ; [28]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 5 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r3 + 3 * 16]               ; [26]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    mova        m5, m3
    pmaddwd     m5, [r3 + 1 * 16]               ; [24]
    paddd       m5, [pd_16]
    psrld       m5, 5
    mova        m1, m0
    pmaddwd     m1, [r3 + 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m5
    punpcklwd   m6, m5

    punpckldq   m5, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m5
    movhps      [r0 + r1], m5
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m1
    movhps      [r2 + r4], m1

    ; ---- columns 4-7 ----
    mova        m4, m3
    pmaddwd     m4, [r3 - 1 * 16]               ; [22]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r3 - 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r3 - 3 * 16]               ; [20]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r3 - 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r3 - 5 * 16]               ; [18]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m5, m0
    pmaddwd     m5, [r3 - 5 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m6, m5

    ; last column: the sample pairs are consumed in place
    pmaddwd     m3, [r3 - 7 * 16]               ; [16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    pmaddwd     m0, [r3 - 7 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m3, m0

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m5, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m3
    punpcklwd   m6, m3

    punpckldq   m3, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m5, m2
    punpckhdq   m5, m2

    movh        [r0 + 8], m3
    movhps      [r0 + r1 + 8], m3
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m5
    movhps      [r0 + r4 + 8], m5

    RET
2512
;-----------------------------------------------------------------------------
; intra_pred_ang8_12 -- 8x8 angular prediction, mode 12, 16-bit samples.
; In:  r0 = dst, r1 = dstStride in samples (doubled to bytes below),
;      r2 = refLeft, r3 = refAbove
;      (names per the intraPredAng8 prototype comment).
; Each output column is one vector of eight values
;     (a * (32 - f) + b * f + pd_16) >> 5
; where f is the ang_table entry noted in brackets on the pmaddwd line.
; Columns 0-5 use the sample pairs starting at r2 sample 0. For columns 6-7
; the pair window slides down by one sample; the new lowest pair is taken
; from the r3 data shuffled with pw_ang8_12 (defined elsewhere).
; Four column vectors are transposed and stored as 4-sample row pieces:
; columns 0-3 first, then columns 4-7 at byte offset 8.
; Scratch: r5 = &ang_table[16], r4 = 3 * stride, r2 reused as row-4 pointer.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_12, 4,6,7
    lea         r5, [ang_table + 16 * 16]
    add         r1, r1                          ; stride: samples -> bytes

    movu        m0, [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    punpcklwd   m3, m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0, m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- columns 0-3 ----
    mova        m4, m3
    pmaddwd     m4, [r5 + 11 * 16]              ; [27]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r5 + 11 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r5 + 6 * 16]               ; [22]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r5 + 6 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    mova        m6, m3
    pmaddwd     m6, [r5 + 1 * 16]               ; [17]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m1, m0
    pmaddwd     m1, [r5 + 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    mova        m5, m3
    pmaddwd     m5, [r5 - 4 * 16]               ; [12]
    paddd       m5, [pd_16]
    psrld       m5, 5
    mova        m1, m0
    pmaddwd     m1, [r5 - 4 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    ; transpose the four column vectors into 4-sample row pieces
    punpckhwd   m1, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m5
    punpcklwd   m6, m5

    punpckldq   m5, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m1, m2
    punpckhdq   m1, m2

    lea         r4, [r1 * 3]
    movh        [r0], m5
    movhps      [r0 + r1], m5
    movh        [r0 + r1 * 2], m4
    movhps      [r0 + r4], m4
    lea         r2, [r0 + r1 * 4]               ; r2 reference fully loaded; r2 -> row 4
    movh        [r2], m6
    movhps      [r2 + r1], m6
    movh        [r2 + r1 * 2], m1
    movhps      [r2 + r4], m1

    ; ---- columns 4-7 ----
    mova        m4, m3
    pmaddwd     m4, [r5 - 9 * 16]               ; [7]
    paddd       m4, [pd_16]
    psrld       m4, 5
    mova        m2, m0
    pmaddwd     m2, [r5 - 9 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    mova        m2, m3
    pmaddwd     m2, [r5 - 14 * 16]              ; [2]
    paddd       m2, [pd_16]
    psrld       m2, 5
    mova        m1, m0
    pmaddwd     m1, [r5 - 14 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; slide the pair window one sample down for the last two columns
    palignr     m0, m3, 12
    movu        m1, [r3]
    pshufb      m1, [pw_ang8_12]
    palignr     m3, m1, 12                      ; lowest pair comes from the shuffled r3 data

    mova        m6, m3
    pmaddwd     m6, [r5 + 13 * 16]              ; [29]
    paddd       m6, [pd_16]
    psrld       m6, 5
    mova        m5, m0
    pmaddwd     m5, [r5 + 13 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m6, m5

    pmaddwd     m3, [r5 + 8 * 16]               ; [24]
    paddd       m3, [pd_16]
    psrld       m3, 5
    pmaddwd     m0, [r5 + 8 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m3, m0

    ; transpose and store at byte offset 8 of every row
    punpckhwd   m5, m4, m2
    punpcklwd   m4, m2
    punpckhwd   m2, m6, m3
    punpcklwd   m6, m3

    punpckldq   m3, m4, m6
    punpckhdq   m4, m6
    punpckldq   m6, m5, m2
    punpckhdq   m5, m2

    movh        [r0 + 8], m3
    movhps      [r0 + r1 + 8], m3
    movh        [r0 + r1 * 2 + 8], m4
    movhps      [r0 + r4 + 8], m4
    lea         r0, [r0 + r1 * 4]
    movh        [r0 + 8], m6
    movhps      [r0 + r1 + 8], m6
    movh        [r0 + r1 * 2 + 8], m5
    movhps      [r0 + r4 + 8], m5

    RET
2648
;-----------------------------------------------------------------------------
; intra_pred_ang8_13: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word vectors, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 23,14,5,28,19,10,1,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements), r2 = main reference, r3 = side reference.
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
; The result vectors are transposed before storing, so each blended vector
; ends up as one output column.
;-----------------------------------------------------------------------------
2649cglobal intra_pred_ang8_13, 4,6,8
; r5 points at the weight entry for w = 14; the other weights are addressed
; relative to it (each ang_table entry is 16 bytes).
2650 lea r5, [ang_table + 14 * 16]
2651 add r1, r1
2652
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
2653 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2654 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2655
2656 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2657 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2658
; First three blends use the unshifted pair window.
2659 mova m4, m3
2660 pmaddwd m4, [r5 + 9 * 16] ; [23]
2661 paddd m4, [pd_16]
2662 psrld m4, 5
2663 mova m2, m0
2664 pmaddwd m2, [r5 + 9 * 16]
2665 paddd m2, [pd_16]
2666 psrld m2, 5
2667 packusdw m4, m2
2668
2669 mova m2, m3
2670 pmaddwd m2, [r5] ; [14]
2671 paddd m2, [pd_16]
2672 psrld m2, 5
2673 mova m1, m0
2674 pmaddwd m1, [r5]
2675 paddd m1, [pd_16]
2676 psrld m1, 5
2677 packusdw m2, m1
2678
2679 mova m6, m3
2680 pmaddwd m6, [r5 - 9 * 16] ; [5]
2681 paddd m6, [pd_16]
2682 psrld m6, 5
2683 mova m1, m0
2684 pmaddwd m1, [r5 - 9 * 16]
2685 paddd m1, [pd_16]
2686 psrld m1, 5
2687 packusdw m6, m1
2688
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_13 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
2689 palignr m0, m3, 12
2690 movu m1, [r3]
2691 pshufb m1, [pw_ang8_13]
2692 palignr m3, m1, 12
2693
2694 mova m5, m3
2695 pmaddwd m5, [r5 + 14 * 16] ; [28]
2696 paddd m5, [pd_16]
2697 psrld m5, 5
2698 mova m7, m0
2699 pmaddwd m7, [r5 + 14 * 16]
2700 paddd m7, [pd_16]
2701 psrld m7, 5
2702 packusdw m5, m7
2703
; Transpose the four result vectors (m4, m2, m6, m5) into 8 rows of 4 words.
2704 punpckhwd m7, m4, m2
2705 punpcklwd m4, m2
2706 punpckhwd m2, m6, m5
2707 punpcklwd m6, m5
2708
2709 punpckldq m5, m4, m6
2710 punpckhdq m4, m6
2711 punpckldq m6, m7, m2
2712 punpckhdq m7, m2
2713
; Store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 reused as row-4 pointer).
2714 lea r4, [r1 * 3]
2715 movh [r0], m5
2716 movhps [r0 + r1], m5
2717 movh [r0 + r1 * 2], m4
2718 movhps [r0 + r4], m4
2719 lea r2, [r0 + r1 * 4]
2720 movh [r2], m6
2721 movhps [r2 + r1], m6
2722 movh [r2 + r1 * 2], m7
2723 movhps [r2 + r4], m7
2724
; Next three blends reuse the same (once-shifted) pair window.
2725 mova m4, m3
2726 pmaddwd m4, [r5 + 5 * 16] ; [19]
2727 paddd m4, [pd_16]
2728 psrld m4, 5
2729 mova m2, m0
2730 pmaddwd m2, [r5 + 5 * 16]
2731 paddd m2, [pd_16]
2732 psrld m2, 5
2733 packusdw m4, m2
2734
2735 mova m2, m3
2736 pmaddwd m2, [r5 - 4 * 16] ; [10]
2737 paddd m2, [pd_16]
2738 psrld m2, 5
2739 mova m5, m0
2740 pmaddwd m5, [r5 - 4 * 16]
2741 paddd m5, [pd_16]
2742 psrld m5, 5
2743 packusdw m2, m5
2744
2745 mova m6, m3
2746 pmaddwd m6, [r5 - 13 * 16] ; [1]
2747 paddd m6, [pd_16]
2748 psrld m6, 5
2749 mova m5, m0
2750 pmaddwd m5, [r5 - 13 * 16]
2751 paddd m5, [pd_16]
2752 psrld m5, 5
2753 packusdw m6, m5
2754
; Advance m1 by one word and slide the pair window down once more.
2755 pslldq m1, 2
2756 palignr m0, m3, 12
2757 palignr m3, m1, 12
2758
; Last blend is done in place; m3/m0 are not needed afterwards.
2759 pmaddwd m3, [r5 + 10 * 16] ; [24]
2760 paddd m3, [pd_16]
2761 psrld m3, 5
2762 pmaddwd m0, [r5 + 10 * 16]
2763 paddd m0, [pd_16]
2764 psrld m0, 5
2765 packusdw m3, m0
2766
; Transpose the second group of four vectors (m4, m2, m6, m3).
2767 punpckhwd m5, m4, m2
2768 punpcklwd m4, m2
2769 punpckhwd m2, m6, m3
2770 punpcklwd m6, m3
2771
2772 punpckldq m3, m4, m6
2773 punpckhdq m4, m6
2774 punpckldq m6, m5, m2
2775 punpckhdq m5, m2
2776
; Store columns 4..7 (byte offset 8) of rows 0..7.
2777 movh [r0 + 8], m3
2778 movhps [r0 + r1 + 8], m3
2779 movh [r0 + r1 * 2 + 8], m4
2780 movhps [r0 + r4 + 8], m4
2781 lea r0, [r0 + r1 * 4]
2782 movh [r0 + 8], m6
2783 movhps [r0 + r1 + 8], m6
2784 movh [r0 + r1 * 2 + 8], m5
2785 movhps [r0 + r4 + 8], m5
2786
2787 RET
2788
;-----------------------------------------------------------------------------
; intra_pred_ang8_14: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word vectors, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 19,6,25,12,31,18,5,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements), r2 = main reference, r3 = side reference.
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
; The result vectors are transposed before storing, so each blended vector
; ends up as one output column.
;-----------------------------------------------------------------------------
2789cglobal intra_pred_ang8_14, 4,6,8
; r5 points at the weight entry for w = 18; other weights are relative to it.
2790 lea r5, [ang_table + 18 * 16]
2791 add r1, r1
2792
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
2793 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2794 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2795
2796 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2797 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2798
; First two blends use the unshifted pair window.
2799 mova m4, m3
2800 pmaddwd m4, [r5 + 1 * 16] ; [19]
2801 paddd m4, [pd_16]
2802 psrld m4, 5
2803 mova m2, m0
2804 pmaddwd m2, [r5 + 1 * 16]
2805 paddd m2, [pd_16]
2806 psrld m2, 5
2807 packusdw m4, m2
2808
2809 mova m2, m3
2810 pmaddwd m2, [r5 - 12 * 16] ; [6]
2811 paddd m2, [pd_16]
2812 psrld m2, 5
2813 mova m1, m0
2814 pmaddwd m1, [r5 - 12 * 16]
2815 paddd m1, [pd_16]
2816 psrld m1, 5
2817 packusdw m2, m1
2818
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_14 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
2819 palignr m0, m3, 12
2820 movu m1, [r3]
2821 pshufb m1, [pw_ang8_14]
2822 palignr m3, m1, 12
2823
2824 mova m6, m3
2825 pmaddwd m6, [r5 + 7 * 16] ; [25]
2826 paddd m6, [pd_16]
2827 psrld m6, 5
2828 mova m5, m0
2829 pmaddwd m5, [r5 + 7 * 16]
2830 paddd m5, [pd_16]
2831 psrld m5, 5
2832 packusdw m6, m5
2833
2834 mova m5, m3
2835 pmaddwd m5, [r5 - 6 * 16] ; [12]
2836 paddd m5, [pd_16]
2837 psrld m5, 5
2838 mova m7, m0
2839 pmaddwd m7, [r5 - 6 * 16]
2840 paddd m7, [pd_16]
2841 psrld m7, 5
2842 packusdw m5, m7
2843
; Transpose the four result vectors (m4, m2, m6, m5) into 8 rows of 4 words.
2844 punpckhwd m7, m4, m2
2845 punpcklwd m4, m2
2846 punpckhwd m2, m6, m5
2847 punpcklwd m6, m5
2848
2849 punpckldq m5, m4, m6
2850 punpckhdq m4, m6
2851 punpckldq m6, m7, m2
2852 punpckhdq m7, m2
2853
; Store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 reused as row-4 pointer).
2854 lea r4, [r1 * 3]
2855 movh [r0], m5
2856 movhps [r0 + r1], m5
2857 movh [r0 + r1 * 2], m4
2858 movhps [r0 + r4], m4
2859 lea r2, [r0 + r1 * 4]
2860 movh [r2], m6
2861 movhps [r2 + r1], m6
2862 movh [r2 + r1 * 2], m7
2863 movhps [r2 + r4], m7
2864
; Advance m1 by one word and slide the pair window down once more.
2865 pslldq m1, 2
2866 palignr m0, m3, 12
2867 palignr m3, m1, 12
2868
; Three blends share this window position.
2869 mova m4, m3
2870 pmaddwd m4, [r5 + 13 * 16] ; [31]
2871 paddd m4, [pd_16]
2872 psrld m4, 5
2873 mova m2, m0
2874 pmaddwd m2, [r5 + 13 * 16]
2875 paddd m2, [pd_16]
2876 psrld m2, 5
2877 packusdw m4, m2
2878
2879 mova m2, m3
2880 pmaddwd m2, [r5] ; [18]
2881 paddd m2, [pd_16]
2882 psrld m2, 5
2883 mova m5, m0
2884 pmaddwd m5, [r5]
2885 paddd m5, [pd_16]
2886 psrld m5, 5
2887 packusdw m2, m5
2888
2889 mova m6, m3
2890 pmaddwd m6, [r5 - 13 * 16] ; [5]
2891 paddd m6, [pd_16]
2892 psrld m6, 5
2893 mova m5, m0
2894 pmaddwd m5, [r5 - 13 * 16]
2895 paddd m5, [pd_16]
2896 psrld m5, 5
2897 packusdw m6, m5
2898
; Final window shift before the last blend.
2899 pslldq m1, 2
2900 palignr m0, m3, 12
2901 palignr m3, m1, 12
2902
; Last blend is done in place; m3/m0 are not needed afterwards.
2903 pmaddwd m3, [r5 + 6 * 16] ; [24]
2904 paddd m3, [pd_16]
2905 psrld m3, 5
2906 pmaddwd m0, [r5 + 6 * 16]
2907 paddd m0, [pd_16]
2908 psrld m0, 5
2909 packusdw m3, m0
2910
; Transpose the second group of four vectors (m4, m2, m6, m3).
2911 punpckhwd m5, m4, m2
2912 punpcklwd m4, m2
2913 punpckhwd m2, m6, m3
2914 punpcklwd m6, m3
2915
2916 punpckldq m3, m4, m6
2917 punpckhdq m4, m6
2918 punpckldq m6, m5, m2
2919 punpckhdq m5, m2
2920
; Store columns 4..7 (byte offset 8) of rows 0..7.
2921 movh [r0 + 8], m3
2922 movhps [r0 + r1 + 8], m3
2923 movh [r0 + r1 * 2 + 8], m4
2924 movhps [r0 + r4 + 8], m4
2925 lea r0, [r0 + r1 * 4]
2926 movh [r0 + 8], m6
2927 movhps [r0 + r1 + 8], m6
2928 movh [r0 + r1 * 2 + 8], m5
2929 movhps [r0 + r4 + 8], m5
2930
2931 RET
2932
;-----------------------------------------------------------------------------
; intra_pred_ang8_15: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word vectors, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 15,30,13,28,11,26,9,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements), r2 = main reference, r3 = side reference.
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
; The result vectors are transposed before storing, so each blended vector
; ends up as one output column.
;-----------------------------------------------------------------------------
2933cglobal intra_pred_ang8_15, 4,6,8
; r5 points at the weight entry for w = 20; other weights are relative to it.
2934 lea r5, [ang_table + 20 * 16]
2935 add r1, r1
2936
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
2937 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2938 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2939
2940 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2941 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2942
; Only the first blend uses the unshifted pair window.
2943 mova m4, m3
2944 pmaddwd m4, [r5 - 5 * 16] ; [15]
2945 paddd m4, [pd_16]
2946 psrld m4, 5
2947 mova m2, m0
2948 pmaddwd m2, [r5 - 5 * 16]
2949 paddd m2, [pd_16]
2950 psrld m2, 5
2951 packusdw m4, m2
2952
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_15 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
2953 palignr m0, m3, 12
2954 movu m1, [r3]
2955 pshufb m1, [pw_ang8_15]
2956 palignr m3, m1, 12
2957
2958 mova m2, m3
2959 pmaddwd m2, [r5 + 10 * 16] ; [30]
2960 paddd m2, [pd_16]
2961 psrld m2, 5
2962 mova m5, m0
2963 pmaddwd m5, [r5 + 10 * 16]
2964 paddd m5, [pd_16]
2965 psrld m5, 5
2966 packusdw m2, m5
2967
2968 mova m6, m3
2969 pmaddwd m6, [r5 - 7 * 16] ; [13]
2970 paddd m6, [pd_16]
2971 psrld m6, 5
2972 mova m5, m0
2973 pmaddwd m5, [r5 - 7 * 16]
2974 paddd m5, [pd_16]
2975 psrld m5, 5
2976 packusdw m6, m5
2977
; Advance m1 by one word and slide the pair window down once more
; (the same three-instruction step recurs below).
2978 pslldq m1, 2
2979 palignr m0, m3, 12
2980 palignr m3, m1, 12
2981
2982 mova m5, m3
2983 pmaddwd m5, [r5 + 8 * 16] ; [28]
2984 paddd m5, [pd_16]
2985 psrld m5, 5
2986 mova m7, m0
2987 pmaddwd m7, [r5 + 8 * 16]
2988 paddd m7, [pd_16]
2989 psrld m7, 5
2990 packusdw m5, m7
2991
; Transpose the four result vectors (m4, m2, m6, m5) into 8 rows of 4 words.
2992 punpckhwd m7, m4, m2
2993 punpcklwd m4, m2
2994 punpckhwd m2, m6, m5
2995 punpcklwd m6, m5
2996
2997 punpckldq m5, m4, m6
2998 punpckhdq m4, m6
2999 punpckldq m6, m7, m2
3000 punpckhdq m7, m2
3001
; Store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 reused as row-4 pointer).
3002 lea r4, [r1 * 3]
3003 movh [r0], m5
3004 movhps [r0 + r1], m5
3005 movh [r0 + r1 * 2], m4
3006 movhps [r0 + r4], m4
3007 lea r2, [r0 + r1 * 4]
3008 movh [r2], m6
3009 movhps [r2 + r1], m6
3010 movh [r2 + r1 * 2], m7
3011 movhps [r2 + r4], m7
3012
; This blend still uses the window position of the previous one ([28]).
3013 mova m4, m3
3014 pmaddwd m4, [r5 - 9 * 16] ; [11]
3015 paddd m4, [pd_16]
3016 psrld m4, 5
3017 mova m2, m0
3018 pmaddwd m2, [r5 - 9 * 16]
3019 paddd m2, [pd_16]
3020 psrld m2, 5
3021 packusdw m4, m2
3022
3023 pslldq m1, 2
3024 palignr m0, m3, 12
3025 palignr m3, m1, 12
3026
3027 mova m2, m3
3028 pmaddwd m2, [r5 + 6 * 16] ; [26]
3029 paddd m2, [pd_16]
3030 psrld m2, 5
3031 mova m5, m0
3032 pmaddwd m5, [r5 + 6 * 16]
3033 paddd m5, [pd_16]
3034 psrld m5, 5
3035 packusdw m2, m5
3036
3037 mova m6, m3
3038 pmaddwd m6, [r5 - 11 * 16] ; [9]
3039 paddd m6, [pd_16]
3040 psrld m6, 5
3041 mova m5, m0
3042 pmaddwd m5, [r5 - 11 * 16]
3043 paddd m5, [pd_16]
3044 psrld m5, 5
3045 packusdw m6, m5
3046
; Final window shift. The lowest word of m3 is then overwritten with the
; 16-bit value at byte offset 16 of the side reference, which lies outside the
; 16 bytes that were loaded into m1.
3047 pslldq m1, 2
3048 palignr m0, m3, 12
3049 palignr m3, m1, 12
3050 pinsrw m3, [r3 + 16], 0
3051
; Last blend is done in place; m3/m0 are not needed afterwards.
3052 pmaddwd m3, [r5 + 4 * 16] ; [24]
3053 paddd m3, [pd_16]
3054 psrld m3, 5
3055 pmaddwd m0, [r5 + 4 * 16]
3056 paddd m0, [pd_16]
3057 psrld m0, 5
3058 packusdw m3, m0
3059
; Transpose the second group of four vectors (m4, m2, m6, m3).
3060 punpckhwd m5, m4, m2
3061 punpcklwd m4, m2
3062 punpckhwd m2, m6, m3
3063 punpcklwd m6, m3
3064
3065 punpckldq m3, m4, m6
3066 punpckhdq m4, m6
3067 punpckldq m6, m5, m2
3068 punpckhdq m5, m2
3069
; Store columns 4..7 (byte offset 8) of rows 0..7.
3070 movh [r0 + 8], m3
3071 movhps [r0 + r1 + 8], m3
3072 movh [r0 + r1 * 2 + 8], m4
3073 movhps [r0 + r4 + 8], m4
3074 lea r0, [r0 + r1 * 4]
3075 movh [r0 + 8], m6
3076 movhps [r0 + r1 + 8], m6
3077 movh [r0 + r1 * 2 + 8], m5
3078 movhps [r0 + r4 + 8], m5
3079
3080 RET
3081
;-----------------------------------------------------------------------------
; intra_pred_ang8_16: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word vectors, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 11,22,1,12,23,2,13,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements), r2 = main reference, r3 = side reference.
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
; The result vectors are transposed before storing, so each blended vector
; ends up as one output column.
;-----------------------------------------------------------------------------
3082cglobal intra_pred_ang8_16, 4,6,8
; r5 points at the weight entry for w = 13; other weights are relative to it.
3083 lea r5, [ang_table + 13 * 16]
3084 add r1, r1
3085
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3086 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
3087 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3088
3089 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3090 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3091
; Only the first blend uses the unshifted pair window.
3092 mova m4, m3
3093 pmaddwd m4, [r5 - 2 * 16] ; [11]
3094 paddd m4, [pd_16]
3095 psrld m4, 5
3096 mova m2, m0
3097 pmaddwd m2, [r5 - 2 * 16]
3098 paddd m2, [pd_16]
3099 psrld m2, 5
3100 packusdw m4, m2
3101
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_16 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3102 palignr m0, m3, 12
3103 movu m1, [r3]
3104 pshufb m1, [pw_ang8_16]
3105 palignr m3, m1, 12
3106
3107 mova m2, m3
3108 pmaddwd m2, [r5 + 9 * 16] ; [22]
3109 paddd m2, [pd_16]
3110 psrld m2, 5
3111 mova m5, m0
3112 pmaddwd m5, [r5 + 9 * 16]
3113 paddd m5, [pd_16]
3114 psrld m5, 5
3115 packusdw m2, m5
3116
3117 mova m6, m3
3118 pmaddwd m6, [r5 - 12 * 16] ; [1]
3119 paddd m6, [pd_16]
3120 psrld m6, 5
3121 mova m5, m0
3122 pmaddwd m5, [r5 - 12 * 16]
3123 paddd m5, [pd_16]
3124 psrld m5, 5
3125 packusdw m6, m5
3126
; Advance m1 by one word and slide the pair window down once more
; (the same three-instruction step recurs below).
3127 pslldq m1, 2
3128 palignr m0, m3, 12
3129 palignr m3, m1, 12
3130
3131 mova m5, m3
3132 pmaddwd m5, [r5 - 1 * 16] ; [12]
3133 paddd m5, [pd_16]
3134 psrld m5, 5
3135 mova m7, m0
3136 pmaddwd m7, [r5 - 1 * 16]
3137 paddd m7, [pd_16]
3138 psrld m7, 5
3139 packusdw m5, m7
3140
; Transpose the four result vectors (m4, m2, m6, m5) into 8 rows of 4 words.
3141 punpckhwd m7, m4, m2
3142 punpcklwd m4, m2
3143 punpckhwd m2, m6, m5
3144 punpcklwd m6, m5
3145
3146 punpckldq m5, m4, m6
3147 punpckhdq m4, m6
3148 punpckldq m6, m7, m2
3149 punpckhdq m7, m2
3150
; Store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 reused as row-4 pointer).
3151 lea r4, [r1 * 3]
3152 movh [r0], m5
3153 movhps [r0 + r1], m5
3154 movh [r0 + r1 * 2], m4
3155 movhps [r0 + r4], m4
3156 lea r2, [r0 + r1 * 4]
3157 movh [r2], m6
3158 movhps [r2 + r1], m6
3159 movh [r2 + r1 * 2], m7
3160 movhps [r2 + r4], m7
3161
3162 pslldq m1, 2
3163 palignr m0, m3, 12
3164 palignr m3, m1, 12
3165
3166 mova m4, m3
3167 pmaddwd m4, [r5 + 10 * 16] ; [23]
3168 paddd m4, [pd_16]
3169 psrld m4, 5
3170 mova m2, m0
3171 pmaddwd m2, [r5 + 10 * 16]
3172 paddd m2, [pd_16]
3173 psrld m2, 5
3174 packusdw m4, m2
3175
3176 mova m2, m3
3177 pmaddwd m2, [r5 - 11 * 16] ; [2]
3178 paddd m2, [pd_16]
3179 psrld m2, 5
3180 mova m5, m0
3181 pmaddwd m5, [r5 - 11 * 16]
3182 paddd m5, [pd_16]
3183 psrld m5, 5
3184 packusdw m2, m5
3185
3186 pslldq m1, 2
3187 palignr m0, m3, 12
3188 palignr m3, m1, 12
3189
3190 mova m6, m3
3191 pmaddwd m6, [r5] ; [13]
3192 paddd m6, [pd_16]
3193 psrld m6, 5
3194 mova m5, m0
3195 pmaddwd m5, [r5]
3196 paddd m5, [pd_16]
3197 psrld m5, 5
3198 packusdw m6, m5
3199
; Final window shift. The lowest word of m3 is then overwritten with the
; 16-bit value at byte offset 16 of the side reference, which lies outside the
; 16 bytes that were loaded into m1.
3200 pslldq m1, 2
3201 palignr m0, m3, 12
3202 palignr m3, m1, 12
3203 pinsrw m3, [r3 + 16], 0
3204
; Last blend is done in place; m3/m0 are not needed afterwards.
3205 pmaddwd m3, [r5 + 11 * 16] ; [24]
3206 paddd m3, [pd_16]
3207 psrld m3, 5
3208 pmaddwd m0, [r5 + 11 * 16]
3209 paddd m0, [pd_16]
3210 psrld m0, 5
3211 packusdw m3, m0
3212
; Transpose the second group of four vectors (m4, m2, m6, m3).
3213 punpckhwd m5, m4, m2
3214 punpcklwd m4, m2
3215 punpckhwd m2, m6, m3
3216 punpcklwd m6, m3
3217
3218 punpckldq m3, m4, m6
3219 punpckhdq m4, m6
3220 punpckldq m6, m5, m2
3221 punpckhdq m5, m2
3222
; Store columns 4..7 (byte offset 8) of rows 0..7.
3223 movh [r0 + 8], m3
3224 movhps [r0 + r1 + 8], m3
3225 movh [r0 + r1 * 2 + 8], m4
3226 movhps [r0 + r4 + 8], m4
3227 lea r0, [r0 + r1 * 4]
3228 movh [r0 + 8], m6
3229 movhps [r0 + r1 + 8], m6
3230 movh [r0 + r1 * 2 + 8], m5
3231 movhps [r0 + r4 + 8], m5
3232
3233 RET
3234
;-----------------------------------------------------------------------------
; intra_pred_ang8_17: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word vectors, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 6,12,18,24,30,4,10,16; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements), r2 = main reference, r3 = side reference.
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
; The result vectors are transposed before storing, so each blended vector
; ends up as one output column.
;-----------------------------------------------------------------------------
3235cglobal intra_pred_ang8_17, 4,6,8
; r5 points at the weight entry for w = 17; other weights are relative to it.
3236 lea r5, [ang_table + 17 * 16]
3237 add r1, r1
3238
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3239 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
3240 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3241
3242 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3243 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3244
; Only the first blend uses the unshifted pair window.
3245 mova m4, m3
3246 pmaddwd m4, [r5 - 11 * 16] ; [6]
3247 paddd m4, [pd_16]
3248 psrld m4, 5
3249 mova m2, m0
3250 pmaddwd m2, [r5 - 11 * 16]
3251 paddd m2, [pd_16]
3252 psrld m2, 5
3253 packusdw m4, m2
3254
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_17 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3255 palignr m0, m3, 12
3256 movu m1, [r3]
3257 pshufb m1, [pw_ang8_17]
3258 palignr m3, m1, 12
3259
3260 mova m2, m3
3261 pmaddwd m2, [r5 - 5 * 16] ; [12]
3262 paddd m2, [pd_16]
3263 psrld m2, 5
3264 mova m5, m0
3265 pmaddwd m5, [r5 - 5 * 16]
3266 paddd m5, [pd_16]
3267 psrld m5, 5
3268 packusdw m2, m5
3269
; Advance m1 by one word and slide the pair window down once more
; (the same three-instruction step recurs below).
3270 pslldq m1, 2
3271 palignr m0, m3, 12
3272 palignr m3, m1, 12
3273
3274 mova m6, m3
3275 pmaddwd m6, [r5 + 1 * 16] ; [18]
3276 paddd m6, [pd_16]
3277 psrld m6, 5
3278 mova m5, m0
3279 pmaddwd m5, [r5 + 1 * 16]
3280 paddd m5, [pd_16]
3281 psrld m5, 5
3282 packusdw m6, m5
3283
3284 pslldq m1, 2
3285 palignr m0, m3, 12
3286 palignr m3, m1, 12
3287
3288 mova m5, m3
3289 pmaddwd m5, [r5 + 7 * 16] ; [24]
3290 paddd m5, [pd_16]
3291 psrld m5, 5
3292 mova m7, m0
3293 pmaddwd m7, [r5 + 7 * 16]
3294 paddd m7, [pd_16]
3295 psrld m7, 5
3296 packusdw m5, m7
3297
; Transpose the four result vectors (m4, m2, m6, m5) into 8 rows of 4 words.
3298 punpckhwd m7, m4, m2
3299 punpcklwd m4, m2
3300 punpckhwd m2, m6, m5
3301 punpcklwd m6, m5
3302
3303 punpckldq m5, m4, m6
3304 punpckhdq m4, m6
3305 punpckldq m6, m7, m2
3306 punpckhdq m7, m2
3307
; Store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 reused as row-4 pointer).
3308 lea r4, [r1 * 3]
3309 movh [r0], m5
3310 movhps [r0 + r1], m5
3311 movh [r0 + r1 * 2], m4
3312 movhps [r0 + r4], m4
3313 lea r2, [r0 + r1 * 4]
3314 movh [r2], m6
3315 movhps [r2 + r1], m6
3316 movh [r2 + r1 * 2], m7
3317 movhps [r2 + r4], m7
3318
3319 pslldq m1, 2
3320 palignr m0, m3, 12
3321 palignr m3, m1, 12
3322
; The next two blends share one window position.
3323 mova m4, m3
3324 pmaddwd m4, [r5 + 13 * 16] ; [30]
3325 paddd m4, [pd_16]
3326 psrld m4, 5
3327 mova m2, m0
3328 pmaddwd m2, [r5 + 13 * 16]
3329 paddd m2, [pd_16]
3330 psrld m2, 5
3331 packusdw m4, m2
3332
3333 mova m2, m3
3334 pmaddwd m2, [r5 - 13 * 16] ; [4]
3335 paddd m2, [pd_16]
3336 psrld m2, 5
3337 mova m5, m0
3338 pmaddwd m5, [r5 - 13 * 16]
3339 paddd m5, [pd_16]
3340 psrld m5, 5
3341 packusdw m2, m5
3342
3343 pslldq m1, 2
3344 palignr m0, m3, 12
3345 palignr m3, m1, 12
3346
3347 mova m6, m3
3348 pmaddwd m6, [r5 - 7 * 16] ; [10]
3349 paddd m6, [pd_16]
3350 psrld m6, 5
3351 mova m5, m0
3352 pmaddwd m5, [r5 - 7 * 16]
3353 paddd m5, [pd_16]
3354 psrld m5, 5
3355 packusdw m6, m5
3356
3357 pslldq m1, 2
3358 palignr m0, m3, 12
3359 palignr m3, m1, 12
3360
; Last blend is done in place; m3/m0 are not needed afterwards.
3361 pmaddwd m3, [r5 - 1 * 16] ; [16]
3362 paddd m3, [pd_16]
3363 psrld m3, 5
3364 pmaddwd m0, [r5 - 1 * 16]
3365 paddd m0, [pd_16]
3366 psrld m0, 5
3367 packusdw m3, m0
3368
; Transpose the second group of four vectors (m4, m2, m6, m3).
3369 punpckhwd m5, m4, m2
3370 punpcklwd m4, m2
3371 punpckhwd m2, m6, m3
3372 punpcklwd m6, m3
3373
3374 punpckldq m3, m4, m6
3375 punpckhdq m4, m6
3376 punpckldq m6, m5, m2
3377 punpckhdq m5, m2
3378
; Store columns 4..7 (byte offset 8) of rows 0..7.
3379 movh [r0 + 8], m3
3380 movhps [r0 + r1 + 8], m3
3381 movh [r0 + r1 * 2 + 8], m4
3382 movhps [r0 + r4 + 8], m4
3383 lea r0, [r0 + r1 * 4]
3384 movh [r0 + 8], m6
3385 movhps [r0 + r1 + 8], m6
3386 movh [r0 + r1 * 2 + 8], m5
3387 movhps [r0 + r4 + 8], m5
3388
3389 RET
3390
;-----------------------------------------------------------------------------
; intra_pred_ang8_18: 8x8 prediction on 16-bit samples with no blending.
; Row 0 is the 8 words at [r3] stored unchanged; each following row is the
; previous row moved up by one word, with one more word taken from the
; shuffled [r2 + 2] vector filling the low end.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements), r2/r3 = the two reference arrays.
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
3391cglobal intra_pred_ang8_18, 4,5,3
3392 add r1, r1
; r4 = 3 * stride, used for rows 3 and 7.
3393 lea r4, [r1 * 3]
3394 movu m1, [r3]
3395 movu m0, [r2 + 2]
; pw_swap16 is defined outside this view; presumably it reverses the word
; order so that the word at [r2 + 2] lands in the top lane - TODO confirm.
3396 pshufb m0, [pw_swap16]
3397 movu [r0], m1
; palignr by 16 - 2k bytes: top k words of m0 followed by the low 8 - k words
; of m1, for k = 1..7 in turn.
3398 palignr m2, m1, m0, 14
3399 movu [r0 + r1], m2
3400 palignr m2, m1, m0, 12
3401 movu [r0 + r1 * 2], m2
3402 palignr m2, m1, m0, 10
3403 movu [r0 + r4], m2
; Move the output pointer to row 4.
3404 lea r0, [r0 + r1 * 4]
3405 palignr m2, m1, m0, 8
3406 movu [r0], m2
3407 palignr m2, m1, m0, 6
3408 movu [r0 + r1], m2
3409 palignr m2, m1, m0, 4
3410 movu [r0 + r1 * 2], m2
; Last row is built in place in m1, which is no longer needed.
3411 palignr m1, m0, 2
3412 movu [r0 + r4], m1
3413 RET
3414
;-----------------------------------------------------------------------------
; intra_pred_ang8_19: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word rows, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 6,12,18,24,30,4,10,16; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; Same weight and shift sequence as intra_pred_ang8_17, but with r3 as the main
; reference, r2 as the side reference, and rows stored directly (no transpose).
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements).
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
3415cglobal intra_pred_ang8_19, 4,6,8
; r5 points at the weight entry for w = 17; other weights are relative to it.
3416 lea r5, [ang_table + 17 * 16]
3417 add r1, r1
3418
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3419 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3420 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3421
3422 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3423 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3424
; Row 0 uses the unshifted pair window.
3425 mova m4, m3
3426 pmaddwd m4, [r5 - 11 * 16] ; [6]
3427 paddd m4, [pd_16]
3428 psrld m4, 5
3429 mova m2, m0
3430 pmaddwd m2, [r5 - 11 * 16]
3431 paddd m2, [pd_16]
3432 psrld m2, 5
3433 packusdw m4, m2
3434
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_17 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3435 palignr m0, m3, 12
3436 movu m1, [r2]
3437 pshufb m1, [pw_ang8_17]
3438 palignr m3, m1, 12
3439
3440 mova m2, m3
3441 pmaddwd m2, [r5 - 5 * 16] ; [12]
3442 paddd m2, [pd_16]
3443 psrld m2, 5
3444 mova m5, m0
3445 pmaddwd m5, [r5 - 5 * 16]
3446 paddd m5, [pd_16]
3447 psrld m5, 5
3448 packusdw m2, m5
3449
; Advance m1 by one word and slide the pair window down once more
; (the same three-instruction step recurs below).
3450 pslldq m1, 2
3451 palignr m0, m3, 12
3452 palignr m3, m1, 12
3453
3454 mova m6, m3
3455 pmaddwd m6, [r5 + 1 * 16] ; [18]
3456 paddd m6, [pd_16]
3457 psrld m6, 5
3458 mova m5, m0
3459 pmaddwd m5, [r5 + 1 * 16]
3460 paddd m5, [pd_16]
3461 psrld m5, 5
3462 packusdw m6, m5
3463
3464 pslldq m1, 2
3465 palignr m0, m3, 12
3466 palignr m3, m1, 12
3467
3468 mova m5, m3
3469 pmaddwd m5, [r5 + 7 * 16] ; [24]
3470 paddd m5, [pd_16]
3471 psrld m5, 5
3472 mova m7, m0
3473 pmaddwd m7, [r5 + 7 * 16]
3474 paddd m7, [pd_16]
3475 psrld m7, 5
3476 packusdw m5, m7
3477
; Store rows 0..3 (r4 = 3 * stride).
3478 lea r4, [r1 * 3]
3479 movu [r0], m4
3480 movu [r0 + r1], m2
3481 movu [r0 + r1 * 2], m6
3482 movu [r0 + r4], m5
3483
3484 pslldq m1, 2
3485 palignr m0, m3, 12
3486 palignr m3, m1, 12
3487
; Rows 4 and 5 share one window position.
3488 mova m4, m3
3489 pmaddwd m4, [r5 + 13 * 16] ; [30]
3490 paddd m4, [pd_16]
3491 psrld m4, 5
3492 mova m2, m0
3493 pmaddwd m2, [r5 + 13 * 16]
3494 paddd m2, [pd_16]
3495 psrld m2, 5
3496 packusdw m4, m2
3497
3498 mova m2, m3
3499 pmaddwd m2, [r5 - 13 * 16] ; [4]
3500 paddd m2, [pd_16]
3501 psrld m2, 5
3502 mova m5, m0
3503 pmaddwd m5, [r5 - 13 * 16]
3504 paddd m5, [pd_16]
3505 psrld m5, 5
3506 packusdw m2, m5
3507
3508 pslldq m1, 2
3509 palignr m0, m3, 12
3510 palignr m3, m1, 12
3511
3512 mova m6, m3
3513 pmaddwd m6, [r5 - 7 * 16] ; [10]
3514 paddd m6, [pd_16]
3515 psrld m6, 5
3516 mova m5, m0
3517 pmaddwd m5, [r5 - 7 * 16]
3518 paddd m5, [pd_16]
3519 psrld m5, 5
3520 packusdw m6, m5
3521
3522 pslldq m1, 2
3523 palignr m0, m3, 12
3524 palignr m3, m1, 12
3525
; Row 7 is blended in place; m3/m0 are not needed afterwards.
3526 pmaddwd m3, [r5 - 1 * 16] ; [16]
3527 paddd m3, [pd_16]
3528 psrld m3, 5
3529 pmaddwd m0, [r5 - 1 * 16]
3530 paddd m0, [pd_16]
3531 psrld m0, 5
3532 packusdw m3, m0
3533
; Store rows 4..7.
3534 lea r0, [r0 + r1 * 4]
3535 movu [r0], m4
3536 movu [r0 + r1], m2
3537 movu [r0 + r1 * 2], m6
3538 movu [r0 + r4], m3
3539
3540 RET
3541
;-----------------------------------------------------------------------------
; intra_pred_ang8_20: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word rows, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 11,22,1,12,23,2,13,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; Same weight and shift sequence as intra_pred_ang8_16, but with r3 as the main
; reference, r2 as the side reference, and rows stored directly (no transpose).
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements).
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
3542cglobal intra_pred_ang8_20, 4,6,8
; r5 points at the weight entry for w = 13; other weights are relative to it.
3543 lea r5, [ang_table + 13 * 16]
3544 add r1, r1
3545
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3546 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3547 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3548
3549 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3550 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3551
; Row 0 uses the unshifted pair window.
3552 mova m4, m3
3553 pmaddwd m4, [r5 - 2 * 16] ; [11]
3554 paddd m4, [pd_16]
3555 psrld m4, 5
3556 mova m2, m0
3557 pmaddwd m2, [r5 - 2 * 16]
3558 paddd m2, [pd_16]
3559 psrld m2, 5
3560 packusdw m4, m2
3561
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_16 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3562 palignr m0, m3, 12
3563 movu m1, [r2]
3564 pshufb m1, [pw_ang8_16]
3565 palignr m3, m1, 12
3566
3567 mova m2, m3
3568 pmaddwd m2, [r5 + 9 * 16] ; [22]
3569 paddd m2, [pd_16]
3570 psrld m2, 5
3571 mova m5, m0
3572 pmaddwd m5, [r5 + 9 * 16]
3573 paddd m5, [pd_16]
3574 psrld m5, 5
3575 packusdw m2, m5
3576
3577 mova m6, m3
3578 pmaddwd m6, [r5 - 12 * 16] ; [1]
3579 paddd m6, [pd_16]
3580 psrld m6, 5
3581 mova m5, m0
3582 pmaddwd m5, [r5 - 12 * 16]
3583 paddd m5, [pd_16]
3584 psrld m5, 5
3585 packusdw m6, m5
3586
; Advance m1 by one word and slide the pair window down once more
; (the same three-instruction step recurs below).
3587 pslldq m1, 2
3588 palignr m0, m3, 12
3589 palignr m3, m1, 12
3590
3591 mova m5, m3
3592 pmaddwd m5, [r5 - 1 * 16] ; [12]
3593 paddd m5, [pd_16]
3594 psrld m5, 5
3595 mova m7, m0
3596 pmaddwd m7, [r5 - 1 * 16]
3597 paddd m7, [pd_16]
3598 psrld m7, 5
3599 packusdw m5, m7
3600
; Store rows 0..3 (r4 = 3 * stride).
3601 lea r4, [r1 * 3]
3602 movu [r0], m4
3603 movu [r0 + r1], m2
3604 movu [r0 + r1 * 2], m6
3605 movu [r0 + r4], m5
3606
3607 pslldq m1, 2
3608 palignr m0, m3, 12
3609 palignr m3, m1, 12
3610
3611 mova m4, m3
3612 pmaddwd m4, [r5 + 10 * 16] ; [23]
3613 paddd m4, [pd_16]
3614 psrld m4, 5
3615 mova m2, m0
3616 pmaddwd m2, [r5 + 10 * 16]
3617 paddd m2, [pd_16]
3618 psrld m2, 5
3619 packusdw m4, m2
3620
3621 mova m2, m3
3622 pmaddwd m2, [r5 - 11 * 16] ; [2]
3623 paddd m2, [pd_16]
3624 psrld m2, 5
3625 mova m5, m0
3626 pmaddwd m5, [r5 - 11 * 16]
3627 paddd m5, [pd_16]
3628 psrld m5, 5
3629 packusdw m2, m5
3630
3631 pslldq m1, 2
3632 palignr m0, m3, 12
3633 palignr m3, m1, 12
3634
3635 mova m6, m3
3636 pmaddwd m6, [r5] ; [13]
3637 paddd m6, [pd_16]
3638 psrld m6, 5
3639 mova m5, m0
3640 pmaddwd m5, [r5]
3641 paddd m5, [pd_16]
3642 psrld m5, 5
3643 packusdw m6, m5
3644
; Final window shift. The lowest word of m3 is then overwritten with the
; 16-bit value at byte offset 16 of the side reference, which lies outside the
; 16 bytes that were loaded into m1.
3645 pslldq m1, 2
3646 palignr m0, m3, 12
3647 palignr m3, m1, 12
3648 pinsrw m3, [r2 + 16], 0
3649
; Row 7 is blended in place; m3/m0 are not needed afterwards.
3650 pmaddwd m3, [r5 + 11 * 16] ; [24]
3651 paddd m3, [pd_16]
3652 psrld m3, 5
3653 pmaddwd m0, [r5 + 11 * 16]
3654 paddd m0, [pd_16]
3655 psrld m0, 5
3656 packusdw m3, m0
3657
; Store rows 4..7.
3658 lea r0, [r0 + r1 * 4]
3659 movu [r0], m4
3660 movu [r0 + r1], m2
3661 movu [r0 + r1 * 2], m6
3662 movu [r0 + r4], m3
3663
3664 RET
3665
;-----------------------------------------------------------------------------
; intra_pred_ang8_21: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word rows, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 15,30,13,28,11,26,9,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; Same weight and shift sequence as intra_pred_ang8_15, but with r3 as the main
; reference, r2 as the side reference, and rows stored directly (no transpose).
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements).
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
3666cglobal intra_pred_ang8_21, 4,6,8
; r5 points at the weight entry for w = 20; other weights are relative to it.
3667 lea r5, [ang_table + 20 * 16]
3668 add r1, r1
3669
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3670 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3671 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3672
3673 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3674 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3675
; Row 0 uses the unshifted pair window.
3676 mova m4, m3
3677 pmaddwd m4, [r5 - 5 * 16] ; [15]
3678 paddd m4, [pd_16]
3679 psrld m4, 5
3680 mova m2, m0
3681 pmaddwd m2, [r5 - 5 * 16]
3682 paddd m2, [pd_16]
3683 psrld m2, 5
3684 packusdw m4, m2
3685
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_15 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3686 palignr m0, m3, 12
3687 movu m1, [r2]
3688 pshufb m1, [pw_ang8_15]
3689 palignr m3, m1, 12
3690
3691 mova m2, m3
3692 pmaddwd m2, [r5 + 10 * 16] ; [30]
3693 paddd m2, [pd_16]
3694 psrld m2, 5
3695 mova m5, m0
3696 pmaddwd m5, [r5 + 10 * 16]
3697 paddd m5, [pd_16]
3698 psrld m5, 5
3699 packusdw m2, m5
3700
3701 mova m6, m3
3702 pmaddwd m6, [r5 - 7 * 16] ; [13]
3703 paddd m6, [pd_16]
3704 psrld m6, 5
3705 mova m5, m0
3706 pmaddwd m5, [r5 - 7 * 16]
3707 paddd m5, [pd_16]
3708 psrld m5, 5
3709 packusdw m6, m5
3710
; Advance m1 by one word and slide the pair window down once more
; (the same three-instruction step recurs below).
3711 pslldq m1, 2
3712 palignr m0, m3, 12
3713 palignr m3, m1, 12
3714
3715 mova m5, m3
3716 pmaddwd m5, [r5 + 8 * 16] ; [28]
3717 paddd m5, [pd_16]
3718 psrld m5, 5
3719 mova m7, m0
3720 pmaddwd m7, [r5 + 8 * 16]
3721 paddd m7, [pd_16]
3722 psrld m7, 5
3723 packusdw m5, m7
3724
; Store rows 0..3 (r4 = 3 * stride).
3725 lea r4, [r1 * 3]
3726 movu [r0], m4
3727 movu [r0 + r1], m2
3728 movu [r0 + r1 * 2], m6
3729 movu [r0 + r4], m5
3730
; Row 4 still uses the window position of row 3.
3731 mova m4, m3
3732 pmaddwd m4, [r5 - 9 * 16] ; [11]
3733 paddd m4, [pd_16]
3734 psrld m4, 5
3735 mova m2, m0
3736 pmaddwd m2, [r5 - 9 * 16]
3737 paddd m2, [pd_16]
3738 psrld m2, 5
3739 packusdw m4, m2
3740
3741 pslldq m1, 2
3742 palignr m0, m3, 12
3743 palignr m3, m1, 12
3744
3745 mova m2, m3
3746 pmaddwd m2, [r5 + 6 * 16] ; [26]
3747 paddd m2, [pd_16]
3748 psrld m2, 5
3749 mova m5, m0
3750 pmaddwd m5, [r5 + 6 * 16]
3751 paddd m5, [pd_16]
3752 psrld m5, 5
3753 packusdw m2, m5
3754
3755 mova m6, m3
3756 pmaddwd m6, [r5 - 11 * 16] ; [9]
3757 paddd m6, [pd_16]
3758 psrld m6, 5
3759 mova m5, m0
3760 pmaddwd m5, [r5 - 11 * 16]
3761 paddd m5, [pd_16]
3762 psrld m5, 5
3763 packusdw m6, m5
3764
; Final window shift. The lowest word of m3 is then overwritten with the
; 16-bit value at byte offset 16 of the side reference, which lies outside the
; 16 bytes that were loaded into m1.
3765 pslldq m1, 2
3766 palignr m0, m3, 12
3767 palignr m3, m1, 12
3768 pinsrw m3, [r2 + 16], 0
3769
; Row 7 is blended in place; m3/m0 are not needed afterwards.
3770 pmaddwd m3, [r5 + 4 * 16] ; [24]
3771 paddd m3, [pd_16]
3772 psrld m3, 5
3773 pmaddwd m0, [r5 + 4 * 16]
3774 paddd m0, [pd_16]
3775 psrld m0, 5
3776 packusdw m3, m0
3777
; Store rows 4..7.
3778 lea r0, [r0 + r1 * 4]
3779 movu [r0], m4
3780 movu [r0 + r1], m2
3781 movu [r0 + r1 * 2], m6
3782 movu [r0 + r4], m3
3783
3784 RET
3785
;-----------------------------------------------------------------------------
; intra_pred_ang8_22: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word rows, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 19,6,25,12,31,18,5,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; Same weight and shift sequence as intra_pred_ang8_14, but with r3 as the main
; reference, r2 as the side reference, and rows stored directly (no transpose).
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements).
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
3786cglobal intra_pred_ang8_22, 4,6,8
; r5 points at the weight entry for w = 18; other weights are relative to it.
3787 lea r5, [ang_table + 18 * 16]
3788 add r1, r1
3789
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3790 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3791 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3792
3793 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3794 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3795
; Rows 0 and 1 use the unshifted pair window.
3796 mova m4, m3
3797 pmaddwd m4, [r5 + 1 * 16] ; [19]
3798 paddd m4, [pd_16]
3799 psrld m4, 5
3800 mova m2, m0
3801 pmaddwd m2, [r5 + 1 * 16]
3802 paddd m2, [pd_16]
3803 psrld m2, 5
3804 packusdw m4, m2
3805
3806 mova m2, m3
3807 pmaddwd m2, [r5 - 12 * 16] ; [6]
3808 paddd m2, [pd_16]
3809 psrld m2, 5
3810 mova m1, m0
3811 pmaddwd m1, [r5 - 12 * 16]
3812 paddd m1, [pd_16]
3813 psrld m1, 5
3814 packusdw m2, m1
3815
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_14 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3816 palignr m0, m3, 12
3817 movu m1, [r2]
3818 pshufb m1, [pw_ang8_14]
3819 palignr m3, m1, 12
3820
3821 mova m6, m3
3822 pmaddwd m6, [r5 + 7 * 16] ; [25]
3823 paddd m6, [pd_16]
3824 psrld m6, 5
3825 mova m5, m0
3826 pmaddwd m5, [r5 + 7 * 16]
3827 paddd m5, [pd_16]
3828 psrld m5, 5
3829 packusdw m6, m5
3830
3831 mova m5, m3
3832 pmaddwd m5, [r5 - 6 * 16] ; [12]
3833 paddd m5, [pd_16]
3834 psrld m5, 5
3835 mova m7, m0
3836 pmaddwd m7, [r5 - 6 * 16]
3837 paddd m7, [pd_16]
3838 psrld m7, 5
3839 packusdw m5, m7
3840
; Store rows 0..3 (r4 = 3 * stride).
3841 lea r4, [r1 * 3]
3842 movu [r0], m4
3843 movu [r0 + r1], m2
3844 movu [r0 + r1 * 2], m6
3845 movu [r0 + r4], m5
3846
; Advance m1 by one word and slide the pair window down once more.
3847 pslldq m1, 2
3848 palignr m0, m3, 12
3849 palignr m3, m1, 12
3850
; Rows 4..6 share this window position.
3851 mova m4, m3
3852 pmaddwd m4, [r5 + 13 * 16] ; [31]
3853 paddd m4, [pd_16]
3854 psrld m4, 5
3855 mova m2, m0
3856 pmaddwd m2, [r5 + 13 * 16]
3857 paddd m2, [pd_16]
3858 psrld m2, 5
3859 packusdw m4, m2
3860
3861 mova m2, m3
3862 pmaddwd m2, [r5] ; [18]
3863 paddd m2, [pd_16]
3864 psrld m2, 5
3865 mova m5, m0
3866 pmaddwd m5, [r5]
3867 paddd m5, [pd_16]
3868 psrld m5, 5
3869 packusdw m2, m5
3870
3871 mova m6, m3
3872 pmaddwd m6, [r5 - 13 * 16] ; [5]
3873 paddd m6, [pd_16]
3874 psrld m6, 5
3875 mova m5, m0
3876 pmaddwd m5, [r5 - 13 * 16]
3877 paddd m5, [pd_16]
3878 psrld m5, 5
3879 packusdw m6, m5
3880
; Final window shift before row 7.
3881 pslldq m1, 2
3882 palignr m0, m3, 12
3883 palignr m3, m1, 12
3884
; Row 7 is blended in place; m3/m0 are not needed afterwards.
3885 pmaddwd m3, [r5 + 6 * 16] ; [24]
3886 paddd m3, [pd_16]
3887 psrld m3, 5
3888 pmaddwd m0, [r5 + 6 * 16]
3889 paddd m0, [pd_16]
3890 psrld m0, 5
3891 packusdw m3, m0
3892
; Store rows 4..7.
3893 lea r0, [r0 + r1 * 4]
3894 movu [r0], m4
3895 movu [r0 + r1], m2
3896 movu [r0 + r1 * 2], m6
3897 movu [r0 + r4], m3
3898
3899 RET
3900
;-----------------------------------------------------------------------------
; intra_pred_ang8_23: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word rows, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 23,14,5,28,19,10,1,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; Same weight and shift sequence as intra_pred_ang8_13, but with r3 as the main
; reference, r2 as the side reference, and rows stored directly (no transpose).
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements).
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
3901cglobal intra_pred_ang8_23, 4,6,8
; r5 points at the weight entry for w = 14; other weights are relative to it.
3902 lea r5, [ang_table + 14 * 16]
3903 add r1, r1
3904
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
3905 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3906 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3907
3908 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3909 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3910
; Rows 0..2 use the unshifted pair window.
3911 mova m4, m3
3912 pmaddwd m4, [r5 + 9 * 16] ; [23]
3913 paddd m4, [pd_16]
3914 psrld m4, 5
3915 mova m2, m0
3916 pmaddwd m2, [r5 + 9 * 16]
3917 paddd m2, [pd_16]
3918 psrld m2, 5
3919 packusdw m4, m2
3920
3921 mova m2, m3
3922 pmaddwd m2, [r5] ; [14]
3923 paddd m2, [pd_16]
3924 psrld m2, 5
3925 mova m1, m0
3926 pmaddwd m1, [r5]
3927 paddd m1, [pd_16]
3928 psrld m1, 5
3929 packusdw m2, m1
3930
3931 mova m6, m3
3932 pmaddwd m6, [r5 - 9 * 16] ; [5]
3933 paddd m6, [pd_16]
3934 psrld m6, 5
3935 mova m1, m0
3936 pmaddwd m1, [r5 - 9 * 16]
3937 paddd m1, [pd_16]
3938 psrld m1, 5
3939 packusdw m6, m1
3940
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_13 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
3941 palignr m0, m3, 12
3942 movu m1, [r2]
3943 pshufb m1, [pw_ang8_13]
3944 palignr m3, m1, 12
3945
3946 mova m5, m3
3947 pmaddwd m5, [r5 + 14 * 16] ; [28]
3948 paddd m5, [pd_16]
3949 psrld m5, 5
3950 mova m7, m0
3951 pmaddwd m7, [r5 + 14 * 16]
3952 paddd m7, [pd_16]
3953 psrld m7, 5
3954 packusdw m5, m7
3955
; Store rows 0..3 (r4 = 3 * stride).
3956 lea r4, [r1 * 3]
3957 movu [r0], m4
3958 movu [r0 + r1], m2
3959 movu [r0 + r1 * 2], m6
3960 movu [r0 + r4], m5
3961
; Rows 4..6 reuse the same (once-shifted) pair window.
3962 mova m4, m3
3963 pmaddwd m4, [r5 + 5 * 16] ; [19]
3964 paddd m4, [pd_16]
3965 psrld m4, 5
3966 mova m2, m0
3967 pmaddwd m2, [r5 + 5 * 16]
3968 paddd m2, [pd_16]
3969 psrld m2, 5
3970 packusdw m4, m2
3971
3972 mova m2, m3
3973 pmaddwd m2, [r5 - 4 * 16] ; [10]
3974 paddd m2, [pd_16]
3975 psrld m2, 5
3976 mova m5, m0
3977 pmaddwd m5, [r5 - 4 * 16]
3978 paddd m5, [pd_16]
3979 psrld m5, 5
3980 packusdw m2, m5
3981
3982 mova m6, m3
3983 pmaddwd m6, [r5 - 13 * 16] ; [1]
3984 paddd m6, [pd_16]
3985 psrld m6, 5
3986 mova m5, m0
3987 pmaddwd m5, [r5 - 13 * 16]
3988 paddd m5, [pd_16]
3989 psrld m5, 5
3990 packusdw m6, m5
3991
; Advance m1 by one word and slide the pair window down once more.
3992 pslldq m1, 2
3993 palignr m0, m3, 12
3994 palignr m3, m1, 12
3995
; Row 7 is blended in place; m3/m0 are not needed afterwards.
3996 pmaddwd m3, [r5 + 10 * 16] ; [24]
3997 paddd m3, [pd_16]
3998 psrld m3, 5
3999 pmaddwd m0, [r5 + 10 * 16]
4000 paddd m0, [pd_16]
4001 psrld m0, 5
4002 packusdw m3, m0
4003
; Store rows 4..7.
4004 lea r0, [r0 + r1 * 4]
4005 movu [r0], m4
4006 movu [r0 + r1], m2
4007 movu [r0 + r1 * 2], m6
4008 movu [r0 + r4], m3
4009
4010 RET
4011
;-----------------------------------------------------------------------------
; intra_pred_ang8_24: 8x8 angular prediction on 16-bit samples.
; Produces eight 8-word rows, each a two-tap blend of neighbouring reference
; samples with ang_table weights (32-w, w), w = 27,22,17,12,7,2,29,24; every
; blend adds pd_16 and shifts right by 5 before packing back to words.
; r3 is the main reference, r2 the side reference (merged in through the
; pw_ang8_12 shuffle); rows are stored directly with no transpose.
; r0 = output pointer, r1 = stride (doubled below, so presumably passed in
; 16-bit elements).
; NOTE(review): which of r2/r3 is the left or above neighbour array is not
; visible from this block - confirm against the caller.
;-----------------------------------------------------------------------------
4012cglobal intra_pred_ang8_24, 4,6,7
; r5 points at the weight entry for w = 16; other weights are relative to it.
4013 lea r5, [ang_table + 16 * 16]
4014 add r1, r1
4015
; Interleave ref[i] with ref[i+1]: m3 = word pairs 0..3, m0 = word pairs 4..7.
4016 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
4017 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
4018
4019 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4020 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4021
; Rows 0..5 all use the unshifted pair window.
4022 mova m4, m3
4023 pmaddwd m4, [r5 + 11 * 16] ; [27]
4024 paddd m4, [pd_16]
4025 psrld m4, 5
4026 mova m2, m0
4027 pmaddwd m2, [r5 + 11 * 16]
4028 paddd m2, [pd_16]
4029 psrld m2, 5
4030 packusdw m4, m2
4031
4032 mova m2, m3
4033 pmaddwd m2, [r5 + 6 * 16] ; [22]
4034 paddd m2, [pd_16]
4035 psrld m2, 5
4036 mova m1, m0
4037 pmaddwd m1, [r5 + 6 * 16]
4038 paddd m1, [pd_16]
4039 psrld m1, 5
4040 packusdw m2, m1
4041
4042 mova m6, m3
4043 pmaddwd m6, [r5 + 1 * 16] ; [17]
4044 paddd m6, [pd_16]
4045 psrld m6, 5
4046 mova m1, m0
4047 pmaddwd m1, [r5 + 1 * 16]
4048 paddd m1, [pd_16]
4049 psrld m1, 5
4050 packusdw m6, m1
4051
4052 mova m5, m3
4053 pmaddwd m5, [r5 - 4 * 16] ; [12]
4054 paddd m5, [pd_16]
4055 psrld m5, 5
4056 mova m1, m0
4057 pmaddwd m1, [r5 - 4 * 16]
4058 paddd m1, [pd_16]
4059 psrld m1, 5
4060 packusdw m5, m1
4061
; Store rows 0..3 (r4 = 3 * stride).
4062 lea r4, [r1 * 3]
4063 movu [r0], m4
4064 movu [r0 + r1], m2
4065 movu [r0 + r1 * 2], m6
4066 movu [r0 + r4], m5
4067
4068 mova m4, m3
4069 pmaddwd m4, [r5 - 9 * 16] ; [7]
4070 paddd m4, [pd_16]
4071 psrld m4, 5
4072 mova m2, m0
4073 pmaddwd m2, [r5 - 9 * 16]
4074 paddd m2, [pd_16]
4075 psrld m2, 5
4076 packusdw m4, m2
4077
4078 mova m2, m3
4079 pmaddwd m2, [r5 - 14 * 16] ; [2]
4080 paddd m2, [pd_16]
4081 psrld m2, 5
4082 mova m1, m0
4083 pmaddwd m1, [r5 - 14 * 16]
4084 paddd m1, [pd_16]
4085 psrld m1, 5
4086 packusdw m2, m1
4087
; Slide the pair window down by one pair: m0 receives the top pair of m3, and
; m3 receives the top dword of the shuffled side-reference vector m1.
; pw_ang8_12 is defined outside this view; presumably it selects the projected
; side-reference samples - TODO confirm.
4088 palignr m0, m3, 12
4089 movu m1, [r2]
4090 pshufb m1, [pw_ang8_12]
4091 palignr m3, m1, 12
4092
4093 mova m6, m3
4094 pmaddwd m6, [r5 + 13 * 16] ; [29]
4095 paddd m6, [pd_16]
4096 psrld m6, 5
4097 mova m5, m0
4098 pmaddwd m5, [r5 + 13 * 16]
4099 paddd m5, [pd_16]
4100 psrld m5, 5
4101 packusdw m6, m5
4102
; Row 7 is blended in place; m3/m0 are not needed afterwards.
4103 pmaddwd m3, [r5 + 8 * 16] ; [24]
4104 paddd m3, [pd_16]
4105 psrld m3, 5
4106 pmaddwd m0, [r5 + 8 * 16]
4107 paddd m0, [pd_16]
4108 psrld m0, 5
4109 packusdw m3, m0
4110
; Store rows 4..7.
4111 lea r0, [r0 + r1 * 4]
4112 movu [r0], m4
4113 movu [r0 + r1], m2
4114 movu [r0 + r1 * 2], m6
4115 movu [r0 + r4], m3
4116
4117 RET
4118
;-----------------------------------------------------------------------------
; intra_pred_ang8_25 -- 8x8 angular prediction, mode 25, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below before it is used as a byte step),
; fourth argument (r3mp) = reference samples.
; Every row blends the interleaved neighbours ref[x], ref[x + 1] with the
; ang_table pair (32 - f, f), adds pd_16 and shifts right by 5.
; The weight f drops from 30 to 16 in steps of 2; all rows use the same
; reference window, starting at byte offset 0.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_25, 3,5,7
    mov         r2, r3mp
    lea         r3, [ang_table + 23 * 16]   ; table entries are 16 bytes each
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2]                    ; [7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    punpcklwd   m3, m0, m1                  ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0, m1                      ; [8 7 7 6 6 5 5 4]

    ; row 0, f = 30
    pmaddwd     m4, m3, [r3 + 7 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 7 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 28
    pmaddwd     m2, m3, [r3 + 5 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 + 5 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 26
    pmaddwd     m6, m3, [r3 + 3 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m1, m0, [r3 + 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    ; row 3, f = 24
    pmaddwd     m5, m3, [r3 + 1 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m1, m0, [r3 + 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m5

    ; row 4, f = 22
    pmaddwd     m4, m3, [r3 - 1 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 - 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 20
    pmaddwd     m2, m3, [r3 - 3 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 - 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 6, f = 18
    pmaddwd     m6, m3, [r3 - 5 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m5, m0, [r3 - 5 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m6, m5

    ; row 7, f = 16 (the source registers are consumed in place)
    pmaddwd     m3, [r3 - 7 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    pmaddwd     m0, [r3 - 7 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m3, m0

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m3

    RET
4221
;-----------------------------------------------------------------------------
; intra_pred_ang8_26 -- 8x8 prediction, mode 26, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), r2 / r3 = the two reference
; arrays, sixth argument (r5m) = filter flag.
; All eight rows receive the same eight samples read from [r3 + 2].  When the
; flag is non-zero, the first sample of every row is replaced by
;   clamp(s + ((t[k + 1] - t[0]) >> 1), 0, pw_1023)
; where s is the first sample read from [r3 + 2] and t is the array at r2.
; NOTE(review): pw_unpackwdq is defined outside this view; going by the
; original annotation it replicates word 0 across the register -- confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_26, 4,5,3
    movu        m0, [r3 + 2]                ; [8 7 6 5 4 3 2 1]
    add         r1, r1
    lea         r4, [r1 * 3]
    lea         r3, [r0 + r1 * 4]           ; r3 now points at output row 4

    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    movu        [r3], m0
    movu        [r3 + r1], m0
    movu        [r3 + r1 * 2], m0
    movu        [r3 + r4], m0

    cmp         r5m, byte 0
    jz          .quit

    ; edge filter for the first column
    pshufb      m0, [pw_unpackwdq]
    movh        m1, [r2]                    ; [3 2 1 0]
    pshufb      m2, m1, [pw_unpackwdq]      ; [0 0 0 0 0 0 0 0]
    movu        m1, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                      ; lower clamp: 0
    pminsw      m0, [pw_1023]               ; upper clamp: pw_1023

    ; word k of m0 becomes the first sample of row k
    pextrw      [r0], m0, 0
    pextrw      [r0 + r1], m0, 1
    pextrw      [r0 + r1 * 2], m0, 2
    pextrw      [r0 + r4], m0, 3
    pextrw      [r3], m0, 4
    pextrw      [r3 + r1], m0, 5
    pextrw      [r3 + r1 * 2], m0, 6
    pextrw      [r3 + r4], m0, 7

.quit:
    RET
4264
;-----------------------------------------------------------------------------
; intra_pred_ang8_27 -- 8x8 angular prediction, mode 27, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.  The weight f climbs from 2 to 16 in
; steps of 2; all rows use the window starting at byte offset 2.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_27, 3,5,7
    mov         r2, r3mp
    lea         r3, [ang_table + 9 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 4]                ; [9 8 7 6 5 4 3 2]
    punpcklwd   m3, m0, m1                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m1                      ; [9 8 8 7 7 6 6 5]

    ; row 0, f = 2
    pmaddwd     m4, m3, [r3 - 7 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 - 7 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 4
    pmaddwd     m2, m3, [r3 - 5 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 - 5 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 6
    pmaddwd     m6, m3, [r3 - 3 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m1, m0, [r3 - 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    ; row 3, f = 8
    pmaddwd     m5, m3, [r3 - 1 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m1, m0, [r3 - 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m5

    ; row 4, f = 10
    pmaddwd     m4, m3, [r3 + 1 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 12
    pmaddwd     m2, m3, [r3 + 3 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 + 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 6, f = 14
    pmaddwd     m6, m3, [r3 + 5 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m5, m0, [r3 + 5 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m6, m5

    ; row 7, f = 16 (the source registers are consumed in place)
    pmaddwd     m3, [r3 + 7 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    pmaddwd     m0, [r3 + 7 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m3, m0

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m3

    RET
4367
;-----------------------------------------------------------------------------
; intra_pred_ang8_28 -- 8x8 angular prediction, mode 28, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.
; Rows 0-5 use f = 5, 10, 15, 20, 25, 30 on the window starting at sample 1;
; rows 6-7 use f = 3, 8 on the window moved up by one sample.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_28, 3,5,7
    mov         r2, r3mp
    lea         r3, [ang_table + 17 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 4]                ; [9 8 7 6 5 4 3 2]
    punpcklwd   m3, m0, m1                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m1                      ; [9 8 8 7 7 6 6 5]

    ; row 0, f = 5
    pmaddwd     m4, m3, [r3 - 12 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 - 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 10
    pmaddwd     m2, m3, [r3 - 7 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 - 7 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 15
    pmaddwd     m6, m3, [r3 - 2 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m1, m0, [r3 - 2 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    ; row 3, f = 20
    pmaddwd     m5, m3, [r3 + 3 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m1, m0, [r3 + 3 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m5

    ; row 4, f = 25
    pmaddwd     m4, m3, [r3 + 8 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 8 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 30
    pmaddwd     m2, m3, [r3 + 13 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 + 13 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; movh reads only 8 bytes, i.e. samples 9..12
    movh        m1, [r2 + 18]               ; [x x x x 12 11 10 9]

    ; row 6, f = 3, window moved up by one sample
    palignr     m6, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    mova        m5, m6                      ; kept for row 7
    pmaddwd     m6, [r3 - 14 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m1, m0, 4                   ; [10 9 9 8 8 7 7 6]
    mova        m3, m1                      ; kept for row 7
    pmaddwd     m1, [r3 - 14 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    ; row 7, f = 8, same window as row 6
    pmaddwd     m5, [r3 - 9 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m3, [r3 - 9 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m5, m3

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m5

    RET
4474
;-----------------------------------------------------------------------------
; intra_pred_ang8_29 -- 8x8 angular prediction, mode 29, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.
; Per-row weight f: 9, 18, 27, 4, 13, 22, 31, 8.  The reference window moves
; up by one sample at row 3 and by one more at row 7.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_29, 3,5,8
    mov         r2, r3mp
    lea         r3, [ang_table + 18 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]

    ; row 0, f = 9
    pmaddwd     m4, m3, [r3 - 9 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 - 9 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 18
    pmaddwd     m2, m3, [r3]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 27
    pmaddwd     m6, m3, [r3 + 9 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m1, m0, [r3 + 9 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    ; row 3, f = 4, window + 1
    palignr     m7, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    pmaddwd     m7, [r3 - 14 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m1, [r3 - 14 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    ; row 4, f = 13, window + 1 (copies kept in m6 / m7 for rows 5 and 6)
    palignr     m4, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    mova        m6, m4
    pmaddwd     m4, [r3 - 5 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    mova        m7, m2
    pmaddwd     m2, [r3 - 5 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 22
    pmaddwd     m2, m6, [r3 + 4 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m7, [r3 + 4 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 6, f = 31
    pmaddwd     m6, [r3 + 13 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 13 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 7, f = 8, window + 2
    palignr     m7, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 - 10 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m5, m0, 8                   ; [11 10 10 9 9 8 8 7]
    pmaddwd     m5, [r3 - 10 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m7, m5

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    RET
4580
;-----------------------------------------------------------------------------
; intra_pred_ang8_30 -- 8x8 angular prediction, mode 30, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.
; Per-row weight f: 13, 26, 7, 20, 1, 14, 27, 8; the window offset in samples
; relative to row 0 is 0, 0, 1, 1, 2, 2, 2, 3.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_30, 3,5,8
    mov         r2, r3mp
    lea         r3, [ang_table + 14 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]

    ; row 0, f = 13
    pmaddwd     m4, m3, [r3 - 1 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 - 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 26
    pmaddwd     m2, m3, [r3 + 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m0, [r3 + 12 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 7, window + 1 (low half copied to m7 for row 3)
    palignr     m6, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    mova        m7, m6
    pmaddwd     m6, [r3 - 7 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m1, [r3 - 7 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    ; row 3, f = 20, window + 1
    pmaddwd     m7, [r3 + 6 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m1, [r3 + 6 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    ; row 4, f = 1, window + 2 (copies kept in m6 / m7 for rows 5 and 6)
    palignr     m4, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    mova        m6, m4
    pmaddwd     m4, [r3 - 13 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    mova        m7, m2
    pmaddwd     m2, [r3 - 13 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 14
    pmaddwd     m2, m6, [r3]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m1, m7, [r3]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 6, f = 27
    pmaddwd     m6, [r3 + 13 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 13 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 7, f = 8, window + 3
    palignr     m7, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
    pmaddwd     m7, [r3 - 6 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m5, m0, 12                  ; [12 11 11 10 10 9 9 8]
    pmaddwd     m5, [r3 - 6 * 16]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m7, m5

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    RET
4686
;-----------------------------------------------------------------------------
; intra_pred_ang8_31 -- 8x8 angular prediction, mode 31, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.
; Per-row weight f: 17, 2, 19, 4, 21, 6, 23, 8; the window offset in samples
; relative to row 0 is 0, 1, 1, 2, 2, 3, 3, 4.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_31, 3,5,8
    mov         r2, r3mp
    lea         r3, [ang_table + 13 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]

    ; row 0, f = 17
    pmaddwd     m4, m3, [r3 + 4 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 4 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 2, window + 1 (copies kept in m6 / m7 for row 2)
    palignr     m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    mova        m6, m2
    pmaddwd     m2, [r3 - 11 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    mova        m7, m1
    pmaddwd     m1, [r3 - 11 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 19
    pmaddwd     m6, [r3 + 6 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 6 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 3, f = 4, window + 2
    palignr     m7, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 - 9 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m1, [r3 - 9 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    ; row 4, f = 21, window + 2
    palignr     m4, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m4, [r3 + 8 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m2, [r3 + 8 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 6, window + 3 (copies kept in m6 / m7 for row 6)
    palignr     m2, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
    mova        m6, m2
    pmaddwd     m2, [r3 - 7 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    mova        m7, m1
    pmaddwd     m1, [r3 - 7 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 6, f = 23
    pmaddwd     m6, [r3 + 10 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 10 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 7, f = 8, window + 4 (m0 / m5 already hold it)
    pmaddwd     m7, m0, [r3 - 5 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    pmaddwd     m1, m5, [r3 - 5 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    RET
4794
;-----------------------------------------------------------------------------
; intra_pred_ang8_32 -- 8x8 angular prediction, mode 32, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.
; Per-row weight f: 21, 10, 31, 20, 9, 30, 19, 8; the window offset in
; samples relative to row 0 is 0, 1, 1, 2, 3, 3, 4, 5.
; Only r0-r4 are used, so five general-purpose registers are requested, the
; same count the other ang8 kernels in this file declare.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_32, 3,5,8
    mov         r2, r3mp
    lea         r3, [ang_table + 19 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]

    ; row 0, f = 21
    pmaddwd     m4, m3, [r3 + 2 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 2 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 10, window + 1 (copies kept in m6 / m7 for row 2)
    palignr     m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    mova        m6, m2
    pmaddwd     m2, [r3 - 9 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    mova        m7, m1
    pmaddwd     m1, [r3 - 9 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; row 2, f = 31
    pmaddwd     m6, [r3 + 12 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 12 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 3, f = 20, window + 2
    palignr     m7, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 + 1 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m1, [r3 + 1 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    ; row 4, f = 9, window + 3 (copies kept in m2 / m6 for row 5)
    palignr     m4, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
    mova        m2, m4
    pmaddwd     m4, [r3 - 10 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m3, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    mova        m6, m3
    pmaddwd     m3, [r3 - 10 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m4, m3

    ; row 5, f = 30
    pmaddwd     m2, [r3 + 11 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m6, [r3 + 11 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; row 6, f = 19, window + 4 (m0 / m5 already hold it)
    pmaddwd     m6, m0, [r3]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, m5, [r3]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 7, f = 8, window + 5; movh fetches samples 13..16 (8 bytes)
    movh        m1, [r2 + 26]               ; [x x x x 16 15 14 13]
    palignr     m7, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m7, [r3 - 11 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, 4                   ; [14 13 13 12 12 11 11 10]
    pmaddwd     m1, [r3 - 11 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    RET
4903
;-----------------------------------------------------------------------------
; intra_pred_ang8_33 -- 8x8 angular prediction, mode 33, 16-bit samples
;
; r0 = dst, r1 = dst stride (doubled below), fourth argument (r3mp) =
; reference samples.
; Every row blends ref[x], ref[x + 1] with the ang_table pair (32 - f, f),
; adds pd_16 and shifts right by 5.
; Per-row weight f: 26, 20, 14, 8, 2, 28, 22, 16; the window offset in
; samples relative to row 0 is 0, 1, 2, 3, 4, 4, 5, 6.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_33, 3,5,8
    mov         r2, r3mp
    lea         r3, [ang_table + 14 * 16]
    add         r1, r1
    lea         r4, [r1 * 3]

    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]
    punpckhwd   m1, m4                      ; [0 16 16 15 15 14 14 13]

    ; row 0, f = 26
    pmaddwd     m4, m3, [r3 + 12 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 1, f = 20, window + 1
    palignr     m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    pmaddwd     m2, [r3 + 6 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m6, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m6, [r3 + 6 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; row 2, f = 14, window + 2
    palignr     m6, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m6, [r3]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m7, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m7, [r3]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 3, f = 8, window + 3
    palignr     m7, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
    pmaddwd     m7, [r3 - 6 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m3, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    pmaddwd     m3, [r3 - 6 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m7, m3

    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    ; row 4, f = 2, window + 4 (m0 / m5 already hold it)
    pmaddwd     m4, m0, [r3 - 12 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m5, [r3 - 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; row 5, f = 28, window + 4
    pmaddwd     m2, m0, [r3 + 14 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m6, m5, [r3 + 14 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; row 6, f = 22, window + 5
    palignr     m6, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m6, [r3 + 8 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m7, m1, m5, 4               ; [14 13 13 12 12 11 11 10]
    pmaddwd     m7, [r3 + 8 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; row 7, f = 16, window + 6
    palignr     m7, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m7, [r3 + 2 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, 8                   ; [15 14 14 13 13 12 12 11]
    pmaddwd     m1, [r3 + 2 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m4
    movu        [r0 + r1], m2
    movu        [r0 + r1 * 2], m6
    movu        [r0 + r4], m7

    RET
5012
5013;-----------------------------------------------------------------------------
5014; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
5015;-----------------------------------------------------------------------------
5016INIT_XMM ssse3
; intra_pred_ang16_2 -- 16x16 prediction that is a pure copy of the reference.
; The reference pointer is r2 unless the fifth argument (r4m) equals 34, in
; which case the fourth argument (r3mp) is used instead.  r1 is doubled before
; it is used as the byte step between rows.
; Output row k (k = 0..15) is the 32 bytes found at [ref + 4 + 2 * k]; the
; shifted windows are built with palignr from consecutive 16-byte loads.
cglobal intra_pred_ang16_2, 3,4,5
    cmp         r4m, byte 34
    cmove       r2, r3mp
    add         r1, r1
    lea         r3, [r1 * 3]
    movu        m0, [r2 + 4]
    movu        m1, [r2 + 20]
    movu        m2, [r2 + 36]

    ; rows 0-3
    movu        [r0], m0
    movu        [r0 + 16], m1
    palignr     m3, m1, m0, 2
    movu        [r0 + r1], m3
    palignr     m4, m2, m1, 2
    movu        [r0 + r1 + 16], m4
    palignr     m3, m1, m0, 4
    movu        [r0 + r1 * 2], m3
    palignr     m4, m2, m1, 4
    movu        [r0 + r1 * 2 + 16], m4
    palignr     m3, m1, m0, 6
    movu        [r0 + r3], m3
    palignr     m4, m2, m1, 6
    movu        [r0 + r3 + 16], m4

    ; rows 4-7
    lea         r0, [r0 + r1 * 4]
    palignr     m3, m1, m0, 8
    movu        [r0], m3
    palignr     m4, m2, m1, 8
    movu        [r0 + 16], m4
    palignr     m3, m1, m0, 10
    movu        [r0 + r1], m3
    palignr     m4, m2, m1, 10
    movu        [r0 + r1 + 16], m4
    palignr     m3, m1, m0, 12
    movu        [r0 + r1 * 2], m3
    palignr     m4, m2, m1, 12
    movu        [r0 + r1 * 2 + 16], m4
    palignr     m3, m1, m0, 14
    movu        [r0 + r3], m3
    palignr     m4, m2, m1, 14
    movu        [r0 + r3 + 16], m4

    ; rows 8-11: the window has advanced by one full register, so m1 / m2 take
    ; the place of m0 / m1 and m0 is reloaded with the next 16 bytes
    movu        m0, [r2 + 52]
    lea         r0, [r0 + r1 * 4]
    movu        [r0], m1
    movu        [r0 + 16], m2
    palignr     m3, m2, m1, 2
    movu        [r0 + r1], m3
    palignr     m4, m0, m2, 2
    movu        [r0 + r1 + 16], m4
    palignr     m3, m2, m1, 4
    movu        [r0 + r1 * 2], m3
    palignr     m4, m0, m2, 4
    movu        [r0 + r1 * 2 + 16], m4
    palignr     m3, m2, m1, 6
    movu        [r0 + r3], m3
    palignr     m4, m0, m2, 6
    movu        [r0 + r3 + 16], m4

    ; rows 12-15
    lea         r0, [r0 + r1 * 4]
    palignr     m3, m2, m1, 8
    movu        [r0], m3
    palignr     m4, m0, m2, 8
    movu        [r0 + 16], m4
    palignr     m3, m2, m1, 10
    movu        [r0 + r1], m3
    palignr     m4, m0, m2, 10
    movu        [r0 + r1 + 16], m4
    palignr     m3, m2, m1, 12
    movu        [r0 + r1 * 2], m3
    palignr     m4, m0, m2, 12
    movu        [r0 + r1 * 2 + 16], m4
    palignr     m3, m2, m1, 14
    movu        [r0 + r3], m3
    palignr     m4, m0, m2, 14
    movu        [r0 + r3 + 16], m4

    RET
5095
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE row0, row1, row2, row3, tmp, colofs
;
; Writes four 8-word result registers to memory.  The expansion starts with a
; JNZ, so the flags must still hold the result of the caller's selection test:
;   ZF = 1 : the 4x8 block is transposed and written as eight 4-word rows at
;            [r0 + colofs], stepping by r1 (r4 is used as 3 * r1).
;            %1-%5 and r5 are overwritten.
;   ZF = 0 : the four registers are written as they are to r5, r5 + r1,
;            r5 + 2 * r1 and r5 + r4.
; Nothing in the macro changes the flags.  colofs is also pasted into the
; local label names, so every expansion within one function needs its own
; value.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE 6
    jnz         .skip%6

    ; word interleave: rows 0/1 into %1 (low) / %5 (high),
    ;                  rows 2/3 into %3 (low) / %2 (high)
    punpckhwd   %5, %1, %2
    punpcklwd   %1, %2
    punpckhwd   %2, %3, %4
    punpcklwd   %3, %4

    ; dword interleave: each register now carries two transposed rows
    punpckldq   %4, %1, %3                  ; output rows 0, 1
    punpckhdq   %1, %3                      ; output rows 2, 3
    punpckldq   %3, %5, %2                  ; output rows 4, 5
    punpckhdq   %5, %2                      ; output rows 6, 7

    lea         r5, [r0 + r1 * 4]           ; base of output rows 4-7
    movh        [r0 + %6], %4
    movhps      [r0 + r1 + %6], %4
    movh        [r0 + r1 * 2 + %6], %1
    movhps      [r0 + r4 + %6], %1
    movh        [r5 + %6], %3
    movhps      [r5 + r1 + %6], %3
    movh        [r5 + r1 * 2 + %6], %5
    movhps      [r5 + r4 + %6], %5
    jmp         .end%6

.skip%6:
    movu        [r5], %1
    movu        [r5 + r1], %2
    movu        [r5 + r1 * 2], %3
    movu        [r5 + r4], %4
.end%6:
%endmacro
5126
5127INIT_XMM sse4
;-----------------------------------------------------------------------------
; ang16_mode_3_33 -- shared worker for intra_pred_ang16_3 / intra_pred_ang16_33
;
; Expects (set up by the callers in this file):
;   r0 = dst, r1 = row step in bytes, r4 = 3 * r1, r2 = reference samples,
;   r3 = ang_table + 16 * 16, r6d = 0 to store transposed, non-zero to store
;   row by row.
; Produces sixteen 8-sample lines.  Line n blends ref[x], ref[x + 1] with the
; ang_table pair (32 - f, f), adds pd_16 and shifts right by 5; the last line
; is a plain copy of the reference.
; The flags produced by the TEST below are consumed by every TRANSPOSE_STORE,
; so no flag-changing instruction may be added in between.  Returns with a
; plain `ret`.  Uses m0-m7 and r5.
;-----------------------------------------------------------------------------
cglobal ang16_mode_3_33
    test        r6d, r6d
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]
    punpckhwd   m1, m4                      ; [0 16 16 15 15 14 14 13]

    ; line 0, f = 26
    pmaddwd     m4, m3, [r3 + 10 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 10 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 1, f = 20
    palignr     m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    pmaddwd     m2, [r3 + 4 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m6, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m6, [r3 + 4 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 2, f = 14
    palignr     m6, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m6, [r3 - 2 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m7, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m7, [r3 - 2 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; line 3, f = 8
    palignr     m7, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
    pmaddwd     m7, [r3 - 8 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m3, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    pmaddwd     m3, [r3 - 8 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m7, m3

    mov         r5, r0

    TRANSPOSE_STORE m4, m2, m6, m7, m3, 0

    ; line 4, f = 2
    pmaddwd     m4, m0, [r3 - 14 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m5, [r3 - 14 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 5, f = 28
    pmaddwd     m2, m0, [r3 + 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m6, m5, [r3 + 12 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 6, f = 22
    palignr     m6, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m6, [r3 + 6 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m7, m1, m5, 4               ; [14 13 13 12 12 11 11 10]
    pmaddwd     m7, [r3 + 6 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; line 7, f = 16
    palignr     m7, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m7, [r3]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, 8                   ; [15 14 14 13 13 12 12 11]
    pmaddwd     m1, [r3]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    lea         r5, [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m3, 8

    movu        m1, [r2 + 26]               ; [20 19 18 17 16 15 14 13]
    psrldq      m4, m1, 2                   ; [0 20 19 18 17 16 15 14]

    punpcklwd   m3, m1, m4                  ; [17 16 16 15 15 14 14 13]
    punpckhwd   m1, m4                      ; [0 20 20 19 19 18 18 17]

    ; line 8, f = 10
    palignr     m4, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    pmaddwd     m4, [r3 - 6 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m3, m5, 12              ; [16 15 15 14 14 13 13 12]
    pmaddwd     m2, [r3 - 6 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 9, f = 4
    pmaddwd     m2, m5, [r3 - 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m6, m3, [r3 - 12 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 10, f = 30
    pmaddwd     m6, m5, [r3 + 14 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, m3, [r3 + 14 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; line 11, f = 24
    palignr     m7, m3, m5, 4               ; [14 13 13 12 12 11 11 10]
    pmaddwd     m7, [r3 + 8 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m0, m1, m3, 4               ; [18 17 17 16 16 15 15 14]
    pmaddwd     m0, [r3 + 8 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m7, m0

    lea         r5, [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m0, 16

    ; line 12, f = 18
    palignr     m4, m3, m5, 8               ; [15 14 14 13 13 12 12 11]
    pmaddwd     m4, [r3 + 2 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m1, m3, 8               ; [19 18 18 17 17 16 16 15]
    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 13, f = 12
    palignr     m2, m3, m5, 12              ; [16 15 15 14 14 13 13 12]
    pmaddwd     m2, [r3 - 4 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m6, m1, m3, 12              ; [20 19 19 18 18 17 17 16]
    pmaddwd     m6, [r3 - 4 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 14, f = 6; the word at [r2 + 42] fills the top lane of m1
    pinsrw      m1, [r2 + 42], 7
    pmaddwd     m3, [r3 - 10 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    pmaddwd     m1, [r3 - 10 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m3, m1

    ; line 15: copied straight from the reference, no blending
    movu        m7, [r2 + 28]

    lea         r5, [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m3, m7, m0, 24

    ret
5314
; intra_pred_ang16_3 -- 16x16 angular prediction, mode 3.
; r0 = dst, r1 = dst stride (doubled below), r2 = reference samples.
; Runs ang16_mode_3_33 twice with r6d = 0 (transposed store).  The second
; pass reads the reference 16 bytes further on and writes 8 rows further
; down.
cglobal intra_pred_ang16_3, 3,7,8
    add         r1, r1
    lea         r4, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    xor         r6d, r6d                    ; 0 selects the transposed store

    call        ang16_mode_3_33

    add         r2, 16
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_3_33

    RET
5329
; intra_pred_ang16_33 -- 16x16 angular prediction, mode 33.
; r0 = dst, r1 = dst stride (doubled below), r3 = reference samples (moved
; into r2 for the worker).
; Runs ang16_mode_3_33 twice with r6d = 1 (row-by-row store).  The second
; pass reads the reference 16 bytes further on and writes 16 bytes further
; to the right.
cglobal intra_pred_ang16_33, 4,7,8
    mov         r2, r3
    add         r1, r1
    lea         r4, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r6d, 1                      ; non-zero selects the direct store

    call        ang16_mode_3_33

    add         r2, 16
    add         r0, 16

    call        ang16_mode_3_33

    RET
5346
;-----------------------------------------------------------------------------
; ang16_mode_4_32 -- shared worker for the 16x16 mode 4 / mode 32 kernels
;
; Expects:
;   r0 = dst, r1 = row step in bytes, r4 = 3 * r1, r2 = reference samples,
;   r6d = 0 to store transposed, non-zero to store row by row.
;   r3 = ang_table base; the per-line weights noted below assume
;   ang_table + 18 * 16 -- the callers are outside this view, confirm there.
; Produces sixteen 8-sample lines.  Line n blends ref[x], ref[x + 1] with the
; ang_table pair (32 - f, f), adds pd_16 and shifts right by 5.
; The flags produced by the TEST below are consumed by every TRANSPOSE_STORE,
; so no flag-changing instruction may be added in between.  Returns with a
; plain `ret`.  Uses m0-m7 and r5.
;-----------------------------------------------------------------------------
cglobal ang16_mode_4_32
    test        r6d, r6d
    movu        m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    palignr     m2, m1, m0, 2               ; [9 8 7 6 5 4 3 2]
    psrldq      m4, m1, 2                   ; [0 16 15 14 13 12 11 10]

    punpcklwd   m3, m0, m2                  ; [5 4 4 3 3 2 2 1]
    punpckhwd   m0, m2                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m5, m1, m4                  ; [13 12 12 11 11 10 10 9]

    ; line 0, f = 21
    pmaddwd     m4, m3, [r3 + 3 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m0, [r3 + 3 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 1, f = 10 (copies kept in m6 / m7 for line 2)
    palignr     m2, m0, m3, 4               ; [6 5 5 4 4 3 3 2]
    mova        m6, m2
    pmaddwd     m2, [r3 - 8 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m1, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    mova        m7, m1
    pmaddwd     m1, [r3 - 8 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m2, m1

    ; line 2, f = 31
    pmaddwd     m6, [r3 + 13 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, [r3 + 13 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    ; line 3, f = 20
    palignr     m7, m0, m3, 8               ; [7 6 6 5 5 4 4 3]
    pmaddwd     m7, [r3 + 2 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m1, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m1, [r3 + 2 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m7, m1

    mov         r5, r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; line 4, f = 9 (copies kept in m2 / m6 for line 5)
    palignr     m4, m0, m3, 12              ; [8 7 7 6 6 5 5 4]
    mova        m2, m4
    pmaddwd     m4, [r3 - 9 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m7, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    mova        m6, m7
    pmaddwd     m7, [r3 - 9 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m4, m7

    ; line 5, f = 30
    pmaddwd     m2, [r3 + 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m6, [r3 + 12 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 6, f = 19
    pmaddwd     m6, m0, [r3 + 1 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m7, m5, [r3 + 1 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    movu        m1, [r2 + 26]               ; [20 19 18 17 16 15 14 13]

    ; line 7, f = 8
    palignr     m7, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m7, [r3 - 10 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m3, m1, m5, 4               ; [14 13 13 12 12 11 11 10]
    pmaddwd     m3, [r3 - 10 * 16]
    paddd       m3, [pd_16]
    psrld       m3, 5
    packusdw    m7, m3

    lea         r5, [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m3, 8

    psrldq      m4, m1, 2                   ; [0 20 19 18 17 16 15 14]

    punpcklwd   m3, m1, m4                  ; [17 16 16 15 15 14 14 13]
    punpckhwd   m1, m4                      ; [0 20 20 19 19 18 18 17]

    ; line 8, f = 29
    palignr     m4, m5, m0, 4               ; [10 9 9 8 8 7 7 6]
    pmaddwd     m4, [r3 + 11 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    palignr     m2, m3, m5, 4               ; [14 13 13 12 12 11 11 10]
    pmaddwd     m2, [r3 + 11 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 9, f = 18
    palignr     m2, m5, m0, 8               ; [11 10 10 9 9 8 8 7]
    pmaddwd     m2, [r3]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m6, m3, m5, 8               ; [15 14 14 13 13 12 12 11]
    pmaddwd     m6, [r3]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 10, f = 7 (low half copied to m7 for line 11; m0 is reused as
    ; scratch from here on)
    palignr     m6, m5, m0, 12              ; [12 11 11 10 10 9 9 8]
    mova        m7, m6
    pmaddwd     m6, [r3 - 11 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    palignr     m0, m3, m5, 12              ; [16 15 15 14 14 13 13 12]
    pmaddwd     m0, [r3 - 11 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m6, m0

    ; line 11, f = 28
    pmaddwd     m7, [r3 + 10 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    palignr     m0, m3, m5, 12              ; [16 15 15 14 14 13 13 12]
    pmaddwd     m0, [r3 + 10 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m7, m0

    lea         r5, [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m0, 16

    ; line 12, f = 17
    pmaddwd     m4, m5, [r3 - 1 * 16]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m2, m3, [r3 - 1 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m4, m2

    ; line 13, f = 6 (copies kept in m7 / m0 for line 14)
    palignr     m2, m3, m5, 4               ; [14 13 13 12 12 11 11 10]
    mova        m7, m2
    pmaddwd     m2, [r3 - 12 * 16]
    paddd       m2, [pd_16]
    psrld       m2, 5
    palignr     m6, m1, m3, 4               ; [18 17 17 16 16 15 15 14]
    mova        m0, m6
    pmaddwd     m6, [r3 - 12 * 16]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m2, m6

    ; line 14, f = 27
    pmaddwd     m7, [r3 + 9 * 16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    pmaddwd     m0, [r3 + 9 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m7, m0

    ; line 15, f = 16
    palignr     m0, m3, m5, 8               ; [15 14 14 13 13 12 12 11]
    pmaddwd     m0, [r3 - 2 * 16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    palignr     m1, m3, 8                   ; [19 18 18 17 17 16 16 15]
    pmaddwd     m1, [r3 - 2 * 16]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m0, m1

    lea         r5, [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m0, m3, 24

    ret
5542
;-----------------------------------------------------------------------------
; intra_pred_ang16_4 - 16x16 angular prediction entry point for mode 4.
; Runs ang16_mode_4_32 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from how the registers are used here - confirm against
; the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_4, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 18 * 16]       ; helper addresses weights relative to entry 18
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_4_32                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_4_32                 ; second pass

    RET
5557
;-----------------------------------------------------------------------------
; intra_pred_ang16_32 - 16x16 angular prediction entry point for mode 32.
; Shares ang16_mode_4_32 with mode 4, but takes its reference samples from the
; fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_32, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 18 * 16]       ; helper addresses weights relative to entry 18
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_4_32                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_4_32                 ; second pass

    RET
5574
;-----------------------------------------------------------------------------
; ang16_mode_5_31 - shared body of intra_pred_ang16_5 and intra_pred_ang16_31.
;
; Builds 16 vectors of eight 16-bit samples, each interpolated between two
; neighbouring reference samples, and hands them to TRANSPOSE_STORE in four
; groups of four (last macro argument 0, 8, 16, 24).
;
; In:   r2  - reference samples, read as 16-bit words starting at [r2 + 2]
;       r3  - ang_table + 16 * 16 (set by both callers); [r3 + k * 16] is the
;             16-byte weight entry {32 - f, f} for f = 16 + k, and the "[f]"
;             trailing comments name that f
;       r6d - 0 when called from mode 5, 1 when called from mode 31
;       r0, r1, r4 - prepared by the caller; r5 is derived from r0/r1 here
;       NOTE(review): TRANSPOSE_STORE is defined outside this chunk; presumably
;       it stores through r0/r1/r4/r5 and branches on the flags produced by the
;       "test" below - confirm.
; Math: pmaddwd on interleaved word pairs [a, b] yields a * (32 - f) + b * f;
;       [pd_16] is then added and the sum shifted right by 5 (pd_16 is defined
;       elsewhere; presumably packed dword 16, i.e. the rounding term).
; Lane comments such as [8 7 6 5 4 3 2 1] list reference sample indices per
; 16-bit lane, highest lane first.
; Uses: m0-m7, r5.
;-----------------------------------------------------------------------------
5575cglobal ang16_mode_5_31
5576 test r6d, r6d ; set ZF from the caller's mode flag (0 or 1)
5577 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5578 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
5579 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
5580 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
5581
; Interleave each sample with its right-hand neighbour so that one pmaddwd
; produces the weighted sum of a pair.  m3, m0, m5 stay live as pair sources.
5582 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
5583 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
5584 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
5585
; ---- vectors 0-3: f = 17, 2, 19, 4 ----
5586 mova m4, m3
5587 pmaddwd m4, [r3 + 1 * 16] ; [17]
5588 paddd m4, [pd_16]
5589 psrld m4, 5
5590 mova m2, m0
5591 pmaddwd m2, [r3 + 1 * 16]
5592 paddd m2, [pd_16]
5593 psrld m2, 5
5594 packusdw m4, m2
5595
5596 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
5597 mova m6, m2
5598 pmaddwd m2, [r3 - 14 * 16] ; [2]
5599 paddd m2, [pd_16]
5600 psrld m2, 5
5601 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5602 mova m7, m1
5603 pmaddwd m1, [r3 - 14 * 16]
5604 paddd m1, [pd_16]
5605 psrld m1, 5
5606 packusdw m2, m1
5607
; m6/m7 still hold the same shifted pairs, reused with the next weight
5608 pmaddwd m6, [r3 + 3 * 16] ; [19]
5609 paddd m6, [pd_16]
5610 psrld m6, 5
5611 pmaddwd m7, [r3 + 3 * 16]
5612 paddd m7, [pd_16]
5613 psrld m7, 5
5614 packusdw m6, m7
5615
5616 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5617 pmaddwd m7, [r3 - 12 * 16] ; [4]
5618 paddd m7, [pd_16]
5619 psrld m7, 5
5620 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5621 pmaddwd m1, [r3 - 12 * 16]
5622 paddd m1, [pd_16]
5623 psrld m1, 5
5624 packusdw m7, m1
5625
5626 mov r5, r0
5627
5628 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
5629
; ---- vectors 4-7: f = 21, 6, 23, 8 ----
5630 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5631 pmaddwd m4, [r3 + 5 * 16] ; [21]
5632 paddd m4, [pd_16]
5633 psrld m4, 5
5634 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5635 pmaddwd m7, [r3 + 5 * 16]
5636 paddd m7, [pd_16]
5637 psrld m7, 5
5638 packusdw m4, m7
5639
5640 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5641 mova m6, m2
5642 pmaddwd m2, [r3 - 10 * 16] ; [6]
5643 paddd m2, [pd_16]
5644 psrld m2, 5
5645 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5646 mova m7, m1
5647 pmaddwd m1, [r3 - 10 * 16]
5648 paddd m1, [pd_16]
5649 psrld m1, 5
5650 packusdw m2, m1
5651
5652 pmaddwd m6, [r3 + 7 * 16] ; [23]
5653 paddd m6, [pd_16]
5654 psrld m6, 5
5655 pmaddwd m7, [r3 + 7 * 16]
5656 paddd m7, [pd_16]
5657 psrld m7, 5
5658 packusdw m6, m7
5659
5660 mova m7, m0
5661 pmaddwd m7, [r3 - 8 * 16] ; [8]
5662 paddd m7, [pd_16]
5663 psrld m7, 5
5664 mova m3, m5 ; m3's original pairs (samples 1-5) are not needed past this point
5665 pmaddwd m3, [r3 - 8 * 16]
5666 paddd m3, [pd_16]
5667 psrld m3, 5
5668 packusdw m7, m3
5669
5670 lea r5, [r0 + r1 * 4]
5671
5672 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
5673
; ---- vectors 8-11: f = 25, 10, 27, 12 ----
5674 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
5675 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
5676
5677 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
5678
5679 mova m4, m0
5680 pmaddwd m4, [r3 + 9 * 16] ; [25]
5681 paddd m4, [pd_16]
5682 psrld m4, 5
5683 mova m2, m5
5684 pmaddwd m2, [r3 + 9 * 16]
5685 paddd m2, [pd_16]
5686 psrld m2, 5
5687 packusdw m4, m2
5688
5689 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5690 mova m6, m2
5691 pmaddwd m2, [r3 - 6 * 16] ; [10]
5692 paddd m2, [pd_16]
5693 psrld m2, 5
5694 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5695 mova m1, m7
5696 pmaddwd m7, [r3 - 6 * 16]
5697 paddd m7, [pd_16]
5698 psrld m7, 5
5699 packusdw m2, m7
5700
5701 pmaddwd m6, [r3 + 11 * 16] ; [27]
5702 paddd m6, [pd_16]
5703 psrld m6, 5
5704 pmaddwd m1, [r3 + 11 * 16]
5705 paddd m1, [pd_16]
5706 psrld m1, 5
5707 packusdw m6, m1
5708
5709 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5710 pmaddwd m7, [r3 - 4 * 16] ; [12]
5711 paddd m7, [pd_16]
5712 psrld m7, 5
5713 palignr m1, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5714 pmaddwd m1, [r3 - 4 * 16]
5715 paddd m1, [pd_16]
5716 psrld m1, 5
5717 packusdw m7, m1
5718
5719 lea r5, [r5 + r1 * 4]
5720
5721 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
5722
; ---- vectors 12-15: f = 29, 14, 31, 16 ----
5723 palignr m4, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5724 pmaddwd m4, [r3 + 13 * 16] ; [29]
5725 paddd m4, [pd_16]
5726 psrld m4, 5
5727 palignr m2, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5728 pmaddwd m2, [r3 + 13 * 16]
5729 paddd m2, [pd_16]
5730 psrld m2, 5
5731 packusdw m4, m2
5732
5733 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5734 mova m7, m2
5735 pmaddwd m2, [r3 - 2 * 16] ; [14]
5736 paddd m2, [pd_16]
5737 psrld m2, 5
5738 palignr m6, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
5739 mova m0, m6
5740 pmaddwd m6, [r3 - 2 * 16]
5741 paddd m6, [pd_16]
5742 psrld m6, 5
5743 packusdw m2, m6
5744
5745 pmaddwd m7, [r3 + 15 * 16] ; [31]
5746 paddd m7, [pd_16]
5747 psrld m7, 5
5748 pmaddwd m0, [r3 + 15 * 16]
5749 paddd m0, [pd_16]
5750 psrld m0, 5
5751 packusdw m7, m0
5752
; last vector: the pair sources m5/m3 are consumed in place
5753 pmaddwd m5, [r3] ; [16]
5754 paddd m5, [pd_16]
5755 psrld m5, 5
5756 pmaddwd m3, [r3]
5757 paddd m3, [pd_16]
5758 psrld m3, 5
5759 packusdw m5, m3
5760
5761 lea r5, [r5 + r1 * 4]
5762
5763 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24
5764
5765 ret
5766
;-----------------------------------------------------------------------------
; intra_pred_ang16_5 - 16x16 angular prediction entry point for mode 5.
; Runs ang16_mode_5_31 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from register usage - confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_5, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 16 * 16]       ; helper addresses weights relative to entry 16
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_5_31                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_5_31                 ; second pass

    RET
5781
;-----------------------------------------------------------------------------
; intra_pred_ang16_31 - 16x16 angular prediction entry point for mode 31.
; Shares ang16_mode_5_31 with mode 5, but takes its reference samples from the
; fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_31, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 16 * 16]       ; helper addresses weights relative to entry 16
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_5_31                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_5_31                 ; second pass

    RET
5798
;-----------------------------------------------------------------------------
; ang16_mode_6_30 - shared body of intra_pred_ang16_6 and intra_pred_ang16_30.
;
; Builds 16 vectors of eight 16-bit samples, each interpolated between two
; neighbouring reference samples, and hands them to TRANSPOSE_STORE in four
; groups of four (last macro argument 0, 8, 16, 24).
;
; In:   r2  - reference samples, read as 16-bit words starting at [r2 + 2]
;       r3  - ang_table + 15 * 16 (set by both callers); [r3 + k * 16] is the
;             16-byte weight entry {32 - f, f} for f = 15 + k, and the "[f]"
;             trailing comments name that f
;       r6d - 0 when called from mode 6, 1 when called from mode 30
;       r0, r1, r4 - prepared by the caller; r5 is derived from r0/r1 here
;       NOTE(review): TRANSPOSE_STORE is defined outside this chunk; presumably
;       it stores through r0/r1/r4/r5 and branches on the flags produced by the
;       "test" below - confirm.
; Math: pmaddwd on interleaved word pairs [a, b] yields a * (32 - f) + b * f;
;       [pd_16] is then added and the sum shifted right by 5 (pd_16 is defined
;       elsewhere; presumably packed dword 16, i.e. the rounding term).
; Lane comments such as [8 7 6 5 4 3 2 1] list reference sample indices per
; 16-bit lane, highest lane first.
; Uses: m0-m7, r5.
;-----------------------------------------------------------------------------
5799cglobal ang16_mode_6_30
5800 test r6d, r6d ; set ZF from the caller's mode flag (0 or 1)
5801 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5802 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
5803 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
5804 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
5805
; Interleave each sample with its right-hand neighbour so that one pmaddwd
; produces the weighted sum of a pair.  m3, m0, m5 stay live as pair sources.
5806 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
5807 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
5808 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
5809
; ---- vectors 0-3: f = 13, 26, 7, 20 ----
5810 mova m4, m3
5811 pmaddwd m4, [r3 - 2 * 16] ; [13]
5812 paddd m4, [pd_16]
5813 psrld m4, 5
5814 mova m2, m0
5815 pmaddwd m2, [r3 - 2 * 16]
5816 paddd m2, [pd_16]
5817 psrld m2, 5
5818 packusdw m4, m2
5819
5820 mova m2, m3
5821 pmaddwd m2, [r3 + 11 * 16] ; [26]
5822 paddd m2, [pd_16]
5823 psrld m2, 5
5824 mova m1, m0
5825 pmaddwd m1, [r3 + 11 * 16]
5826 paddd m1, [pd_16]
5827 psrld m1, 5
5828 packusdw m2, m1
5829
5830 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
5831 mova m7, m6
5832 pmaddwd m6, [r3 - 8 * 16] ; [7]
5833 paddd m6, [pd_16]
5834 psrld m6, 5
5835 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5836 pmaddwd m1, [r3 - 8 * 16]
5837 paddd m1, [pd_16]
5838 psrld m1, 5
5839 packusdw m6, m1
5840
5841 pmaddwd m7, [r3 + 5 * 16] ; [20]
5842 paddd m7, [pd_16]
5843 psrld m7, 5
5844 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5845 pmaddwd m1, [r3 + 5 * 16]
5846 paddd m1, [pd_16]
5847 psrld m1, 5
5848 packusdw m7, m1
5849
5850 mov r5, r0
5851
5852 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
5853
; ---- vectors 4-7: f = 1, 14, 27, 8 ----
5854 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5855 mova m6, m4
5856 pmaddwd m4, [r3 - 14 * 16] ; [1]
5857 paddd m4, [pd_16]
5858 psrld m4, 5
5859 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5860 mova m7, m1
5861 pmaddwd m1, [r3 - 14 * 16]
5862 paddd m1, [pd_16]
5863 psrld m1, 5
5864 packusdw m4, m1
5865
; m6/m7 keep the shifted pairs for the next two weights
5866 mova m2, m6
5867 pmaddwd m2, [r3 - 1 * 16] ; [14]
5868 paddd m2, [pd_16]
5869 psrld m2, 5
5870 mova m1, m7
5871 pmaddwd m1, [r3 - 1 * 16]
5872 paddd m1, [pd_16]
5873 psrld m1, 5
5874 packusdw m2, m1
5875
5876 pmaddwd m6, [r3 + 12 * 16] ; [27]
5877 paddd m6, [pd_16]
5878 psrld m6, 5
5879 pmaddwd m7, [r3 + 12 * 16]
5880 paddd m7, [pd_16]
5881 psrld m7, 5
5882 packusdw m6, m7
5883
5884 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5885 pmaddwd m7, [r3 - 7 * 16] ; [8]
5886 paddd m7, [pd_16]
5887 psrld m7, 5
5888 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5889 pmaddwd m1, [r3 - 7 * 16]
5890 paddd m1, [pd_16]
5891 psrld m1, 5
5892 packusdw m7, m1
5893
5894 lea r5, [r0 + r1 * 4]
5895
5896 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
5897
; ---- vectors 8-11: f = 21, 2, 15, 28 ----
5898 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5899 pmaddwd m4, [r3 + 6 * 16] ; [21]
5900 paddd m4, [pd_16]
5901 psrld m4, 5
5902 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5903 pmaddwd m2, [r3 + 6 * 16]
5904 paddd m2, [pd_16]
5905 psrld m2, 5
5906 packusdw m4, m2
5907
5908 mova m2, m0
5909 pmaddwd m2, [r3 - 13 * 16] ; [2]
5910 paddd m2, [pd_16]
5911 psrld m2, 5
5912 mova m7, m5
5913 pmaddwd m7, [r3 - 13 * 16]
5914 paddd m7, [pd_16]
5915 psrld m7, 5
5916 packusdw m2, m7
5917
5918 mova m6, m0
5919 pmaddwd m6, [r3] ; [15]
5920 paddd m6, [pd_16]
5921 psrld m6, 5
5922 mova m1, m5
5923 pmaddwd m1, [r3]
5924 paddd m1, [pd_16]
5925 psrld m1, 5
5926 packusdw m6, m1
5927
5928 mova m7, m0
5929 pmaddwd m7, [r3 + 13 * 16] ; [28]
5930 paddd m7, [pd_16]
5931 psrld m7, 5
5932 mova m1, m5
5933 pmaddwd m1, [r3 + 13 * 16]
5934 paddd m1, [pd_16]
5935 psrld m1, 5
5936 packusdw m7, m1
5937
5938 lea r5, [r5 + r1 * 4]
5939
5940 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
5941
; ---- vectors 12-15: f = 9, 22, 3, 16 ----
; m3 is reloaded with plain (non-interleaved) samples; only its lowest lanes
; are shifted into the pair vectors by the palignr instructions below.
5942 movh m3, [r2 + 26] ; [16 15 14 13]
5943
5944 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5945 mova m2, m4
5946 pmaddwd m4, [r3 - 6 * 16] ; [9]
5947 paddd m4, [pd_16]
5948 psrld m4, 5
5949 palignr m1, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5950 mova m6, m1
5951 pmaddwd m1, [r3 - 6 * 16]
5952 paddd m1, [pd_16]
5953 psrld m1, 5
5954 packusdw m4, m1
5955
5956 pmaddwd m2, [r3 + 7 * 16] ; [22]
5957 paddd m2, [pd_16]
5958 psrld m2, 5
5959 mova m1, m6
5960 pmaddwd m1, [r3 + 7 * 16]
5961 paddd m1, [pd_16]
5962 psrld m1, 5
5963 packusdw m2, m1
5964
5965 psrldq m3, 2 ; drop sample 13 so lanes 0-1 become samples 14, 15
5966 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5967 mova m5, m7
5968 pmaddwd m7, [r3 - 12 * 16] ; [3]
5969 paddd m7, [pd_16]
5970 psrld m7, 5
5971 palignr m3, m6, 4 ; [15 14 14 13 13 12 12 11]
5972 mova m1, m3
5973 pmaddwd m3, [r3 - 12 * 16]
5974 paddd m3, [pd_16]
5975 psrld m3, 5
5976 packusdw m7, m3
5977
5978 pmaddwd m5, [r3 + 1 * 16] ; [16]
5979 paddd m5, [pd_16]
5980 psrld m5, 5
5981 pmaddwd m1, [r3 + 1 * 16]
5982 paddd m1, [pd_16]
5983 psrld m1, 5
5984 packusdw m5, m1
5985
5986 lea r5, [r5 + r1 * 4]
5987
5988 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24
5989
5990 ret
5991
;-----------------------------------------------------------------------------
; intra_pred_ang16_6 - 16x16 angular prediction entry point for mode 6.
; Runs ang16_mode_6_30 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from register usage - confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_6, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 15 * 16]       ; helper addresses weights relative to entry 15
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_6_30                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_6_30                 ; second pass

    RET
6006
;-----------------------------------------------------------------------------
; intra_pred_ang16_30 - 16x16 angular prediction entry point for mode 30.
; Shares ang16_mode_6_30 with mode 6, but takes its reference samples from the
; fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_30, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 15 * 16]       ; helper addresses weights relative to entry 15
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_6_30                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_6_30                 ; second pass

    RET
6023
;-----------------------------------------------------------------------------
; ang16_mode_7_29 - shared body of intra_pred_ang16_7 and intra_pred_ang16_29.
;
; Builds 16 vectors of eight 16-bit samples, each interpolated between two
; neighbouring reference samples, and hands them to TRANSPOSE_STORE in four
; groups of four (last macro argument 0, 8, 16, 24).
;
; In:   r2  - reference samples, read as 16-bit words starting at [r2 + 2]
;       r3  - ang_table + 17 * 16 (set by both callers); [r3 + k * 16] is the
;             16-byte weight entry {32 - f, f} for f = 17 + k, and the "[f]"
;             trailing comments name that f
;       r6d - 0 when called from mode 7, 1 when called from mode 29
;       r0, r1, r4 - prepared by the caller; r5 is derived from r0/r1 here
;       NOTE(review): TRANSPOSE_STORE is defined outside this chunk; presumably
;       it stores through r0/r1/r4/r5 and branches on the flags produced by the
;       "test" below - confirm.
; Math: pmaddwd on interleaved word pairs [a, b] yields a * (32 - f) + b * f;
;       [pd_16] is then added and the sum shifted right by 5 (pd_16 is defined
;       elsewhere; presumably packed dword 16, i.e. the rounding term).
; Lane comments such as [8 7 6 5 4 3 2 1] list reference sample indices per
; 16-bit lane, highest lane first.
; Uses: m0-m7, r5.
;-----------------------------------------------------------------------------
6024cglobal ang16_mode_7_29
6025 test r6d, r6d ; set ZF from the caller's mode flag (0 or 1)
6026 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6027 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6028 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6029 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6030
; Interleave each sample with its right-hand neighbour so that one pmaddwd
; produces the weighted sum of a pair.  m3, m0, m5 stay live as pair sources.
6031 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6032 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6033 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6034
; ---- vectors 0-3: f = 9, 18, 27, 4 ----
6035 mova m4, m3
6036 pmaddwd m4, [r3 - 8 * 16] ; [9]
6037 paddd m4, [pd_16]
6038 psrld m4, 5
6039 mova m2, m0
6040 pmaddwd m2, [r3 - 8 * 16]
6041 paddd m2, [pd_16]
6042 psrld m2, 5
6043 packusdw m4, m2
6044
6045 mova m2, m3
6046 pmaddwd m2, [r3 + 1 * 16] ; [18]
6047 paddd m2, [pd_16]
6048 psrld m2, 5
6049 mova m1, m0
6050 pmaddwd m1, [r3 + 1 * 16]
6051 paddd m1, [pd_16]
6052 psrld m1, 5
6053 packusdw m2, m1
6054
6055 mova m6, m3
6056 pmaddwd m6, [r3 + 10 * 16] ; [27]
6057 paddd m6, [pd_16]
6058 psrld m6, 5
6059 mova m1, m0
6060 pmaddwd m1, [r3 + 10 * 16]
6061 paddd m1, [pd_16]
6062 psrld m1, 5
6063 packusdw m6, m1
6064
6065 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6066 pmaddwd m7, [r3 - 13 * 16] ; [4]
6067 paddd m7, [pd_16]
6068 psrld m7, 5
6069 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6070 pmaddwd m1, [r3 - 13 * 16]
6071 paddd m1, [pd_16]
6072 psrld m1, 5
6073 packusdw m7, m1
6074
6075 mov r5, r0
6076
6077 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6078
; ---- vectors 4-7: f = 13, 22, 31, 8 ----
6079 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6080 mova m6, m4
6081 pmaddwd m4, [r3 - 4 * 16] ; [13]
6082 paddd m4, [pd_16]
6083 psrld m4, 5
6084 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6085 mova m7, m1
6086 pmaddwd m1, [r3 - 4 * 16]
6087 paddd m1, [pd_16]
6088 psrld m1, 5
6089 packusdw m4, m1
6090
; m6/m7 keep the shifted pairs for the next two weights
6091 mova m2, m6
6092 pmaddwd m2, [r3 + 5 * 16] ; [22]
6093 paddd m2, [pd_16]
6094 psrld m2, 5
6095 mova m1, m7
6096 pmaddwd m1, [r3 + 5 * 16]
6097 paddd m1, [pd_16]
6098 psrld m1, 5
6099 packusdw m2, m1
6100
6101 pmaddwd m6, [r3 + 14 * 16] ; [31]
6102 paddd m6, [pd_16]
6103 psrld m6, 5
6104 pmaddwd m7, [r3 + 14 * 16]
6105 paddd m7, [pd_16]
6106 psrld m7, 5
6107 packusdw m6, m7
6108
6109 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6110 pmaddwd m7, [r3 - 9 * 16] ; [8]
6111 paddd m7, [pd_16]
6112 psrld m7, 5
6113 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6114 pmaddwd m1, [r3 - 9 * 16]
6115 paddd m1, [pd_16]
6116 psrld m1, 5
6117 packusdw m7, m1
6118
6119 lea r5, [r0 + r1 * 4]
6120
6121 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6122
; ---- vectors 8-11: f = 17, 26, 3, 12 ----
6123 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6124 mova m2, m4
6125 pmaddwd m4, [r3] ; [17]
6126 paddd m4, [pd_16]
6127 psrld m4, 5
6128 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6129 mova m7, m1
6130 pmaddwd m1, [r3]
6131 paddd m1, [pd_16]
6132 psrld m1, 5
6133 packusdw m4, m1
6134
6135 pmaddwd m2, [r3 + 9 * 16] ; [26]
6136 paddd m2, [pd_16]
6137 psrld m2, 5
6138 pmaddwd m7, [r3 + 9 * 16]
6139 paddd m7, [pd_16]
6140 psrld m7, 5
6141 packusdw m2, m7
6142
6143 palignr m6, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6144 pmaddwd m6, [r3 - 14 * 16] ; [3]
6145 paddd m6, [pd_16]
6146 psrld m6, 5
6147 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6148 pmaddwd m1, [r3 - 14 * 16]
6149 paddd m1, [pd_16]
6150 psrld m1, 5
6151 packusdw m6, m1
6152
6153 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6154 pmaddwd m7, [r3 - 5 * 16] ; [12]
6155 paddd m7, [pd_16]
6156 psrld m7, 5
6157 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6158 pmaddwd m1, [r3 - 5 * 16]
6159 paddd m1, [pd_16]
6160 psrld m1, 5
6161 packusdw m7, m1
6162
6163 lea r5, [r5 + r1 * 4]
6164
6165 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6166
; ---- vectors 12-15: f = 21, 30, 7, 16 ----
6167 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6168 mova m2, m4
6169 pmaddwd m4, [r3 + 4 * 16] ; [21]
6170 paddd m4, [pd_16]
6171 psrld m4, 5
6172 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6173 mova m3, m1 ; m3's original pairs (samples 1-5) are not needed past this point
6174 pmaddwd m1, [r3 + 4 * 16]
6175 paddd m1, [pd_16]
6176 psrld m1, 5
6177 packusdw m4, m1
6178
6179 pmaddwd m2, [r3 + 13 * 16] ; [30]
6180 paddd m2, [pd_16]
6181 psrld m2, 5
6182 pmaddwd m3, [r3 + 13 * 16]
6183 paddd m3, [pd_16]
6184 psrld m3, 5
6185 packusdw m2, m3
6186
6187 mova m7, m0
6188 pmaddwd m7, [r3 - 10 * 16] ; [7]
6189 paddd m7, [pd_16]
6190 psrld m7, 5
6191 mova m3, m5
6192 pmaddwd m3, [r3 - 10 * 16]
6193 paddd m3, [pd_16]
6194 psrld m3, 5
6195 packusdw m7, m3
6196
; last vector: the pair sources m0/m5 are consumed in place
6197 pmaddwd m0, [r3 - 1 * 16] ; [16]
6198 paddd m0, [pd_16]
6199 psrld m0, 5
6200 pmaddwd m5, [r3 - 1 * 16]
6201 paddd m5, [pd_16]
6202 psrld m5, 5
6203 packusdw m0, m5
6204
6205 lea r5, [r5 + r1 * 4]
6206
6207 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24
6208
6209 ret
6210
;-----------------------------------------------------------------------------
; intra_pred_ang16_7 - 16x16 angular prediction entry point for mode 7.
; Runs ang16_mode_7_29 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from register usage - confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_7, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 17 * 16]       ; helper addresses weights relative to entry 17
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_7_29                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_7_29                 ; second pass

    RET
6225
;-----------------------------------------------------------------------------
; intra_pred_ang16_29 - 16x16 angular prediction entry point for mode 29.
; Shares ang16_mode_7_29 with mode 7, but takes its reference samples from the
; fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_29, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 17 * 16]       ; helper addresses weights relative to entry 17
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_7_29                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_7_29                 ; second pass

    RET
6242
;-----------------------------------------------------------------------------
; ang16_mode_8_28 - shared body of intra_pred_ang16_8 and intra_pred_ang16_28.
;
; Builds 16 vectors of eight 16-bit samples, each interpolated between two
; neighbouring reference samples, and hands them to TRANSPOSE_STORE in four
; groups of four (last macro argument 0, 8, 16, 24).
;
; In:   r2  - reference samples, read as 16-bit words starting at [r2 + 2]
;       r3  - ang_table + 15 * 16 (set by both callers); [r3 + k * 16] is the
;             16-byte weight entry {32 - f, f} for f = 15 + k, and the "[f]"
;             trailing comments name that f
;       r6d - 0 when called from mode 8, 1 when called from mode 28
;       r0, r1, r4 - prepared by the caller; r5 is derived from r0/r1 here
;       NOTE(review): TRANSPOSE_STORE is defined outside this chunk; presumably
;       it stores through r0/r1/r4/r5 and branches on the flags produced by the
;       "test" below - confirm.
; Math: pmaddwd on interleaved word pairs [a, b] yields a * (32 - f) + b * f;
;       [pd_16] is then added and the sum shifted right by 5 (pd_16 is defined
;       elsewhere; presumably packed dword 16, i.e. the rounding term).
; Lane comments such as [8 7 6 5 4 3 2 1] list reference sample indices per
; 16-bit lane, highest lane first.
; Uses: m0-m7, r5.
;-----------------------------------------------------------------------------
6243cglobal ang16_mode_8_28
6244 test r6d, r6d ; set ZF from the caller's mode flag (0 or 1)
6245 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6246 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6247 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6248 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6249
; Interleave each sample with its right-hand neighbour so that one pmaddwd
; produces the weighted sum of a pair.  m3, m0, m5 stay live as pair sources.
6250 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6251 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6252 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6253
; ---- vectors 0-3: f = 5, 10, 15, 20 (all on the unshifted pairs) ----
6254 mova m4, m3
6255 pmaddwd m4, [r3 - 10 * 16] ; [5]
6256 paddd m4, [pd_16]
6257 psrld m4, 5
6258 mova m2, m0
6259 pmaddwd m2, [r3 - 10 * 16]
6260 paddd m2, [pd_16]
6261 psrld m2, 5
6262 packusdw m4, m2
6263
6264 mova m2, m3
6265 pmaddwd m2, [r3 - 5 * 16] ; [10]
6266 paddd m2, [pd_16]
6267 psrld m2, 5
6268 mova m1, m0
6269 pmaddwd m1, [r3 - 5 * 16]
6270 paddd m1, [pd_16]
6271 psrld m1, 5
6272 packusdw m2, m1
6273
6274 mova m6, m3
6275 pmaddwd m6, [r3] ; [15]
6276 paddd m6, [pd_16]
6277 psrld m6, 5
6278 mova m1, m0
6279 pmaddwd m1, [r3]
6280 paddd m1, [pd_16]
6281 psrld m1, 5
6282 packusdw m6, m1
6283
6284 mova m7, m3
6285 pmaddwd m7, [r3 + 5 * 16] ; [20]
6286 paddd m7, [pd_16]
6287 psrld m7, 5
6288 mova m1, m0
6289 pmaddwd m1, [r3 + 5 * 16]
6290 paddd m1, [pd_16]
6291 psrld m1, 5
6292 packusdw m7, m1
6293
6294 mov r5, r0
6295
6296 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6297
; ---- vectors 4-7: f = 25, 30, 3, 8 ----
6298 mova m4, m3
6299 pmaddwd m4, [r3 + 10 * 16] ; [25]
6300 paddd m4, [pd_16]
6301 psrld m4, 5
6302 mova m1, m0
6303 pmaddwd m1, [r3 + 10 * 16]
6304 paddd m1, [pd_16]
6305 psrld m1, 5
6306 packusdw m4, m1
6307
6308 mova m2, m3
6309 pmaddwd m2, [r3 + 15 * 16] ; [30]
6310 paddd m2, [pd_16]
6311 psrld m2, 5
6312 mova m1, m0
6313 pmaddwd m1, [r3 + 15 * 16]
6314 paddd m1, [pd_16]
6315 psrld m1, 5
6316 packusdw m2, m1
6317
6318 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6319 pmaddwd m6, [r3 - 12 * 16] ; [3]
6320 paddd m6, [pd_16]
6321 psrld m6, 5
6322 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6323 pmaddwd m7, [r3 - 12 * 16]
6324 paddd m7, [pd_16]
6325 psrld m7, 5
6326 packusdw m6, m7
6327
6328 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6329 pmaddwd m7, [r3 - 7 * 16] ; [8]
6330 paddd m7, [pd_16]
6331 psrld m7, 5
6332 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6333 pmaddwd m1, [r3 - 7 * 16]
6334 paddd m1, [pd_16]
6335 psrld m1, 5
6336 packusdw m7, m1
6337
6338 lea r5, [r0 + r1 * 4]
6339
6340 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6341
; ---- vectors 8-11: f = 13, 18, 23, 28 (all on the pairs shifted by one) ----
6342 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6343 mova m7, m4
6344 pmaddwd m4, [r3 - 2 *16] ; [13]
6345 paddd m4, [pd_16]
6346 psrld m4, 5
6347 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6348 mova m1, m6
6349 pmaddwd m6, [r3 - 2 * 16]
6350 paddd m6, [pd_16]
6351 psrld m6, 5
6352 packusdw m4, m6
6353
6354 mova m2, m7
6355 pmaddwd m2, [r3 + 3 * 16] ; [18]
6356 paddd m2, [pd_16]
6357 psrld m2, 5
6358 mova m6, m1
6359 pmaddwd m6, [r3 + 3 * 16]
6360 paddd m6, [pd_16]
6361 psrld m6, 5
6362 packusdw m2, m6
6363
6364 mova m6, m7
6365 pmaddwd m6, [r3 + 8 * 16] ; [23]
6366 paddd m6, [pd_16]
6367 psrld m6, 5
6368 pmaddwd m1, [r3 + 8 * 16]
6369 paddd m1, [pd_16]
6370 psrld m1, 5
6371 packusdw m6, m1
6372
6373 pmaddwd m7, [r3 + 13 * 16] ; [28]
6374 paddd m7, [pd_16]
6375 psrld m7, 5
6376 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6377 pmaddwd m1, [r3 + 13 * 16]
6378 paddd m1, [pd_16]
6379 psrld m1, 5
6380 packusdw m7, m1
6381
6382 lea r5, [r5 + r1 * 4]
6383
6384 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6385
; ---- vectors 12-15: f = 1, 6, 11, 16 (all on the pairs shifted by two) ----
; m1 and m5 hold the shifted pairs from here on; m0 and m3 become scratch.
6386 palignr m1, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6387 mova m4, m1
6388 pmaddwd m4, [r3 - 14 * 16] ; [1]
6389 paddd m4, [pd_16]
6390 psrld m4, 5
6391 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6392 mova m0, m5
6393 pmaddwd m0, [r3 - 14 * 16]
6394 paddd m0, [pd_16]
6395 psrld m0, 5
6396 packusdw m4, m0
6397
6398 mova m2, m1
6399 pmaddwd m2, [r3 - 9 * 16] ; [6]
6400 paddd m2, [pd_16]
6401 psrld m2, 5
6402 mova m3, m5
6403 pmaddwd m3, [r3 - 9 * 16]
6404 paddd m3, [pd_16]
6405 psrld m3, 5
6406 packusdw m2, m3
6407
6408 mova m7, m1
6409 pmaddwd m7, [r3 - 4 * 16] ; [11]
6410 paddd m7, [pd_16]
6411 psrld m7, 5
6412 mova m3, m5
6413 pmaddwd m3, [r3 - 4 * 16]
6414 paddd m3, [pd_16]
6415 psrld m3, 5
6416 packusdw m7, m3
6417
6418 pmaddwd m1, [r3 + 1 * 16] ; [16]
6419 paddd m1, [pd_16]
6420 psrld m1, 5
6421 pmaddwd m5, [r3 + 1 * 16]
6422 paddd m5, [pd_16]
6423 psrld m5, 5
6424 packusdw m1, m5
6425
6426 lea r5, [r5 + r1 * 4]
6427
6428 TRANSPOSE_STORE m4, m2, m7, m1, m3, 24
6429
6430 ret
6431
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 - 16x16 angular prediction entry point for mode 8.
; Runs ang16_mode_8_28 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from register usage - confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_8, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 15 * 16]       ; helper addresses weights relative to entry 15
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_8_28                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_8_28                 ; second pass

    RET
6446
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 - 16x16 angular prediction entry point for mode 28.
; Shares ang16_mode_8_28 with mode 8, but takes its reference samples from the
; fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_28, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 15 * 16]       ; helper addresses weights relative to entry 15
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_8_28                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_8_28                 ; second pass

    RET
6463
;-----------------------------------------------------------------------------
; ang16_mode_9_27 - shared body of intra_pred_ang16_9 and intra_pred_ang16_27.
;
; Builds 16 vectors of eight 16-bit samples and hands them to TRANSPOSE_STORE
; in four groups of four (last macro argument 0, 8, 16, 24).  The first 15
; vectors interpolate the same sample pairs with f = 2, 4, ..., 30; the 16th
; is the plain samples at [r2 + 4].
;
; In:   r2  - reference samples, read as 16-bit words starting at [r2 + 2]
;       r3  - ang_table + 16 * 16 (set by both callers); [r3 + k * 16] is the
;             16-byte weight entry {32 - f, f} for f = 16 + k, and the "[f]"
;             trailing comments name that f
;       r6d - 0 when called from mode 9, 1 when called from mode 27
;       r0, r1, r4 - prepared by the caller; r5 is derived from r0/r1 here
;       NOTE(review): TRANSPOSE_STORE is defined outside this chunk; presumably
;       it stores through r0/r1/r4/r5 and branches on the flags produced by the
;       "test" below - confirm.
; Math: pmaddwd on interleaved word pairs [a, b] yields a * (32 - f) + b * f;
;       [pd_16] is then added and the sum shifted right by 5 (pd_16 is defined
;       elsewhere; presumably packed dword 16, i.e. the rounding term).
; Lane comments such as [8 7 6 5 4 3 2 1] list reference sample indices per
; 16-bit lane, highest lane first.
; Uses: m0-m4, m6, m7, r5.
;-----------------------------------------------------------------------------
6464cglobal ang16_mode_9_27
6465 test r6d, r6d ; set ZF from the caller's mode flag (0 or 1)
6466 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6467 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
6468
; m3 (low pairs) and m0 (high pairs) are the only pair sources in this routine
6469 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
6470 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
6471
; ---- vectors 0-3: f = 2, 4, 6, 8 ----
6472 mova m4, m3
6473 pmaddwd m4, [r3 - 14 * 16] ; [2]
6474 paddd m4, [pd_16]
6475 psrld m4, 5
6476 mova m2, m0
6477 pmaddwd m2, [r3 - 14 * 16]
6478 paddd m2, [pd_16]
6479 psrld m2, 5
6480 packusdw m4, m2
6481
6482 mova m2, m3
6483 pmaddwd m2, [r3 - 12 * 16] ; [4]
6484 paddd m2, [pd_16]
6485 psrld m2, 5
6486 mova m1, m0
6487 pmaddwd m1, [r3 - 12 * 16]
6488 paddd m1, [pd_16]
6489 psrld m1, 5
6490 packusdw m2, m1
6491
6492 mova m6, m3
6493 pmaddwd m6, [r3 - 10 *16] ; [6]
6494 paddd m6, [pd_16]
6495 psrld m6, 5
6496 mova m1, m0
6497 pmaddwd m1, [r3 - 10 * 16]
6498 paddd m1, [pd_16]
6499 psrld m1, 5
6500 packusdw m6, m1
6501
6502 mova m7, m3
6503 pmaddwd m7, [r3 - 8 * 16] ; [8]
6504 paddd m7, [pd_16]
6505 psrld m7, 5
6506 mova m1, m0
6507 pmaddwd m1, [r3 - 8 * 16]
6508 paddd m1, [pd_16]
6509 psrld m1, 5
6510 packusdw m7, m1
6511
6512 mov r5, r0
6513
6514 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6515
; ---- vectors 4-7: f = 10, 12, 14, 16 ----
6516 mova m4, m3
6517 pmaddwd m4, [r3 - 6 * 16] ; [10]
6518 paddd m4, [pd_16]
6519 psrld m4, 5
6520 mova m1, m0
6521 pmaddwd m1, [r3 - 6 * 16]
6522 paddd m1, [pd_16]
6523 psrld m1, 5
6524 packusdw m4, m1
6525
6526 mova m2, m3
6527 pmaddwd m2, [r3 - 4 * 16] ; [12]
6528 paddd m2, [pd_16]
6529 psrld m2, 5
6530 mova m1, m0
6531 pmaddwd m1, [r3 - 4 * 16]
6532 paddd m1, [pd_16]
6533 psrld m1, 5
6534 packusdw m2, m1
6535
6536 mova m6, m3
6537 pmaddwd m6, [r3 - 2 * 16] ; [14]
6538 paddd m6, [pd_16]
6539 psrld m6, 5
6540 mova m7, m0
6541 pmaddwd m7, [r3 - 2 * 16]
6542 paddd m7, [pd_16]
6543 psrld m7, 5
6544 packusdw m6, m7
6545
6546 mova m7, m3
6547 pmaddwd m7, [r3] ; [16]
6548 paddd m7, [pd_16]
6549 psrld m7, 5
6550 mova m1, m0
6551 pmaddwd m1, [r3]
6552 paddd m1, [pd_16]
6553 psrld m1, 5
6554 packusdw m7, m1
6555
6556 lea r5, [r0 + r1 * 4]
6557
6558 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6559
; ---- vectors 8-11: f = 18, 20, 22, 24 ----
6560 mova m4, m3
6561 pmaddwd m4, [r3 + 2 *16] ; [18]
6562 paddd m4, [pd_16]
6563 psrld m4, 5
6564 mova m6, m0
6565 pmaddwd m6, [r3 + 2 * 16]
6566 paddd m6, [pd_16]
6567 psrld m6, 5
6568 packusdw m4, m6
6569
6570 mova m2, m3
6571 pmaddwd m2, [r3 + 4 * 16] ; [20]
6572 paddd m2, [pd_16]
6573 psrld m2, 5
6574 mova m6, m0
6575 pmaddwd m6, [r3 + 4 * 16]
6576 paddd m6, [pd_16]
6577 psrld m6, 5
6578 packusdw m2, m6
6579
6580 mova m6, m3
6581 pmaddwd m6, [r3 + 6 * 16] ; [22]
6582 paddd m6, [pd_16]
6583 psrld m6, 5
6584 mova m1, m0
6585 pmaddwd m1, [r3 + 6 * 16]
6586 paddd m1, [pd_16]
6587 psrld m1, 5
6588 packusdw m6, m1
6589
6590 mova m7, m3
6591 pmaddwd m7, [r3 + 8 * 16] ; [24]
6592 paddd m7, [pd_16]
6593 psrld m7, 5
6594 mova m1, m0
6595 pmaddwd m1, [r3 + 8 * 16]
6596 paddd m1, [pd_16]
6597 psrld m1, 5
6598 packusdw m7, m1
6599
6600 lea r5, [r5 + r1 * 4]
6601
6602 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6603
; ---- vectors 12-15: f = 26, 28, 30, then the unweighted samples ----
6604 mova m4, m3
6605 pmaddwd m4, [r3 + 10 * 16] ; [26]
6606 paddd m4, [pd_16]
6607 psrld m4, 5
6608 mova m1, m0
6609 pmaddwd m1, [r3 + 10 * 16]
6610 paddd m1, [pd_16]
6611 psrld m1, 5
6612 packusdw m4, m1
6613
6614 mova m2, m3
6615 pmaddwd m2, [r3 + 12 * 16] ; [28]
6616 paddd m2, [pd_16]
6617 psrld m2, 5
6618 mova m1, m0
6619 pmaddwd m1, [r3 + 12 * 16]
6620 paddd m1, [pd_16]
6621 psrld m1, 5
6622 packusdw m2, m1
6623
; the pair sources m3/m0 are consumed in place for the last weighted vector
6624 pmaddwd m3, [r3 + 14 * 16] ; [30]
6625 paddd m3, [pd_16]
6626 psrld m3, 5
6627 pmaddwd m0, [r3 + 14 * 16]
6628 paddd m0, [pd_16]
6629 psrld m0, 5
6630 packusdw m3, m0
6631
6632 movu m7, [r2 + 4] ; [9 8 7 6 5 4 3 2] - 16th vector is stored without interpolation
6633
6634 lea r5, [r5 + r1 * 4]
6635
6636 TRANSPOSE_STORE m4, m2, m3, m7, m1, 24
6637
6638 ret
6639
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 - 16x16 angular prediction entry point for mode 9.
; Runs ang16_mode_9_27 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from register usage - confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_9, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 16 * 16]       ; helper addresses weights relative to entry 16
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_9_27                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_9_27                 ; second pass

    RET
6654
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 - 16x16 angular prediction entry point for mode 27.
; Shares ang16_mode_9_27 with mode 9, but takes its reference samples from the
; fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_27, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 16 * 16]       ; helper addresses weights relative to entry 16
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_9_27                 ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_9_27                 ; second pass

    RET
6671
;-----------------------------------------------------------------------------
; ang16_mode_11_25 - shared body of intra_pred_ang16_11 and intra_pred_ang16_25.
;
; Builds 16 vectors of eight 16-bit samples and hands them to TRANSPOSE_STORE
; in four groups of four (last macro argument 0, 8, 16, 24).  The first 15
; vectors interpolate the same sample pairs with f = 30, 28, ..., 2; the 16th
; is the plain samples at [r2].
;
; In:   r2  - reference samples, read as 16-bit words starting at [r2]
;       r3  - ang_table + 16 * 16 (set by both callers); [r3 + k * 16] is the
;             16-byte weight entry {32 - f, f} for f = 16 + k, and the "[f]"
;             trailing comments name that f
;       r6d - 0 when called from mode 11, 1 when called from mode 25
;       r0, r1, r4 - prepared by the caller; r5 is derived from r0/r1 here
;       NOTE(review): TRANSPOSE_STORE is defined outside this chunk; presumably
;       it stores through r0/r1/r4/r5 and branches on the flags produced by the
;       "test" below - confirm.
; Math: pmaddwd on interleaved word pairs [a, b] yields a * (32 - f) + b * f;
;       [pd_16] is then added and the sum shifted right by 5 (pd_16 is defined
;       elsewhere; presumably packed dword 16, i.e. the rounding term).
; Lane comments such as [7 6 5 4 3 2 1 0] list reference sample indices per
; 16-bit lane, highest lane first.
; Uses: m0-m4, m6, m7, r5.
;-----------------------------------------------------------------------------
6672cglobal ang16_mode_11_25
6673 test r6d, r6d ; set ZF from the caller's mode flag (0 or 1)
6674 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
6675 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6676
; m3 (low pairs) and m0 (high pairs) are the only pair sources in this routine
6677 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
6678 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
6679
; ---- vectors 0-3: f = 30, 28, 26, 24 ----
6680 mova m4, m3
6681 pmaddwd m4, [r3 + 14 * 16] ; [30]
6682 paddd m4, [pd_16]
6683 psrld m4, 5
6684 mova m2, m0
6685 pmaddwd m2, [r3 + 14 * 16]
6686 paddd m2, [pd_16]
6687 psrld m2, 5
6688 packusdw m4, m2
6689
6690 mova m2, m3
6691 pmaddwd m2, [r3 + 12 * 16] ; [28]
6692 paddd m2, [pd_16]
6693 psrld m2, 5
6694 mova m1, m0
6695 pmaddwd m1, [r3 + 12 * 16]
6696 paddd m1, [pd_16]
6697 psrld m1, 5
6698 packusdw m2, m1
6699
6700 mova m6, m3
6701 pmaddwd m6, [r3 + 10 *16] ; [26]
6702 paddd m6, [pd_16]
6703 psrld m6, 5
6704 mova m1, m0
6705 pmaddwd m1, [r3 + 10 * 16]
6706 paddd m1, [pd_16]
6707 psrld m1, 5
6708 packusdw m6, m1
6709
6710 mova m7, m3
6711 pmaddwd m7, [r3 + 8 * 16] ; [24]
6712 paddd m7, [pd_16]
6713 psrld m7, 5
6714 mova m1, m0
6715 pmaddwd m1, [r3 + 8 * 16]
6716 paddd m1, [pd_16]
6717 psrld m1, 5
6718 packusdw m7, m1
6719
6720 mov r5, r0
6721
6722 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6723
; ---- vectors 4-7: f = 22, 20, 18, 16 ----
6724 mova m4, m3
6725 pmaddwd m4, [r3 + 6 * 16] ; [22]
6726 paddd m4, [pd_16]
6727 psrld m4, 5
6728 mova m1, m0
6729 pmaddwd m1, [r3 + 6 * 16]
6730 paddd m1, [pd_16]
6731 psrld m1, 5
6732 packusdw m4, m1
6733
6734 mova m2, m3
6735 pmaddwd m2, [r3 + 4 * 16] ; [20]
6736 paddd m2, [pd_16]
6737 psrld m2, 5
6738 mova m1, m0
6739 pmaddwd m1, [r3 + 4 * 16]
6740 paddd m1, [pd_16]
6741 psrld m1, 5
6742 packusdw m2, m1
6743
6744 mova m6, m3
6745 pmaddwd m6, [r3 + 2 * 16] ; [18]
6746 paddd m6, [pd_16]
6747 psrld m6, 5
6748 mova m7, m0
6749 pmaddwd m7, [r3 + 2 * 16]
6750 paddd m7, [pd_16]
6751 psrld m7, 5
6752 packusdw m6, m7
6753
6754 mova m7, m3
6755 pmaddwd m7, [r3] ; [16]
6756 paddd m7, [pd_16]
6757 psrld m7, 5
6758 mova m1, m0
6759 pmaddwd m1, [r3]
6760 paddd m1, [pd_16]
6761 psrld m1, 5
6762 packusdw m7, m1
6763
6764 lea r5, [r0 + r1 * 4]
6765
6766 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6767
; ---- vectors 8-11: f = 14, 12, 10, 8 ----
6768 mova m4, m3
6769 pmaddwd m4, [r3 - 2 *16] ; [14]
6770 paddd m4, [pd_16]
6771 psrld m4, 5
6772 mova m6, m0
6773 pmaddwd m6, [r3 - 2 * 16]
6774 paddd m6, [pd_16]
6775 psrld m6, 5
6776 packusdw m4, m6
6777
6778 mova m2, m3
6779 pmaddwd m2, [r3 - 4 * 16] ; [12]
6780 paddd m2, [pd_16]
6781 psrld m2, 5
6782 mova m6, m0
6783 pmaddwd m6, [r3 - 4 * 16]
6784 paddd m6, [pd_16]
6785 psrld m6, 5
6786 packusdw m2, m6
6787
6788 mova m6, m3
6789 pmaddwd m6, [r3 - 6 * 16] ; [10]
6790 paddd m6, [pd_16]
6791 psrld m6, 5
6792 mova m1, m0
6793 pmaddwd m1, [r3 - 6 * 16]
6794 paddd m1, [pd_16]
6795 psrld m1, 5
6796 packusdw m6, m1
6797
6798 mova m7, m3
6799 pmaddwd m7, [r3 - 8 * 16] ; [8]
6800 paddd m7, [pd_16]
6801 psrld m7, 5
6802 mova m1, m0
6803 pmaddwd m1, [r3 - 8 * 16]
6804 paddd m1, [pd_16]
6805 psrld m1, 5
6806 packusdw m7, m1
6807
6808 lea r5, [r5 + r1 * 4]
6809
6810 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6811
; ---- vectors 12-15: f = 6, 4, 2, then the unweighted samples ----
6812 mova m4, m3
6813 pmaddwd m4, [r3 - 10 * 16] ; [6]
6814 paddd m4, [pd_16]
6815 psrld m4, 5
6816 mova m1, m0
6817 pmaddwd m1, [r3 - 10 * 16]
6818 paddd m1, [pd_16]
6819 psrld m1, 5
6820 packusdw m4, m1
6821
6822 mova m2, m3
6823 pmaddwd m2, [r3 - 12 * 16] ; [4]
6824 paddd m2, [pd_16]
6825 psrld m2, 5
6826 mova m1, m0
6827 pmaddwd m1, [r3 - 12 * 16]
6828 paddd m1, [pd_16]
6829 psrld m1, 5
6830 packusdw m2, m1
6831
6832 mova m7, m3
6833 pmaddwd m7, [r3 - 14 * 16] ; [2]
6834 paddd m7, [pd_16]
6835 psrld m7, 5
6836 mova m1, m0
6837 pmaddwd m1, [r3 - 14 * 16]
6838 paddd m1, [pd_16]
6839 psrld m1, 5
6840 packusdw m7, m1
6841
6842 movu m3, [r2] ; [7 6 5 4 3 2 1 0] - 16th vector is stored without interpolation
6843
6844 lea r5, [r5 + r1 * 4]
6845
6846 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
6847
6848 ret
6849
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 - 16x16 angular prediction entry point for mode 11.
; Runs ang16_mode_11_25 twice with r6d = 0.  Between the two passes the
; reference pointer r2 moves on by 16 bytes (eight 16-bit samples) and the
; output pointer r0 moves on by eight doubled strides.
; NOTE(review): argument roles (r0 = dst, r1 = stride in samples, r2 = reference
; samples) are inferred from register usage - confirm against the C prototype.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_11, 3,7,8
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)
    lea         r3, [ang_table + 16 * 16]       ; helper addresses weights relative to entry 16
    xor         r6d, r6d                        ; r6d = 0, tested by the helper

    call        ang16_mode_11_25                ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    lea         r0, [r0 + r1 * 8]

    call        ang16_mode_11_25                ; second pass

    RET
6864
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 - 16x16 angular prediction entry point for mode 25.
; Shares ang16_mode_11_25 with mode 11, but takes its reference samples from
; the fourth argument (r3) and calls the helper with r6d = 1.  Between the two
; passes both the reference pointer and the output pointer move on by 16 bytes.
; NOTE(review): r6d presumably selects the non-transposing path inside
; TRANSPOSE_STORE (macro not visible in this chunk) - confirm.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_25, 4,7,8
    mov         r2, r3                          ; must happen before r3 is reused for the table
    mov         r6d, 1                          ; r6d = 1, tested by the helper
    lea         r3, [ang_table + 16 * 16]       ; helper addresses weights relative to entry 16
    add         r1, r1                          ; r1 = 2 * stride
    lea         r4, [r1 + r1 * 2]               ; r4 = 3 * r1 (after doubling)

    call        ang16_mode_11_25                ; first pass

    add         r2, 16                          ; flags are re-derived by the helper's own test
    add         r0, 16

    call        ang16_mode_11_25                ; second pass

    RET
6881
; ang16_mode_12_24: shared kernel for angular modes 12 and 24; produces sixteen
; 8-word interpolated vectors and hands them to TRANSPOSE_STORE four at a time.
; Inputs (set up by the callers):
;   r0  - output base (copied to r5 before each store group)
;   r1  - row step in bytes (r5 is advanced by r1 * 4 between store groups)
;   r2  - main reference: 9 consecutive words are loaded from [r2] / [r2 + 2]
;   r3d - selector; tested once at entry. No instruction in this block reads the
;         flags afterwards, so presumably TRANSPOSE_STORE (defined elsewhere)
;         branches on them - TODO confirm
;   r6  - points into ang_table; the bracketed comments give the weight index
;         assuming r6 = ang_table + 16 * 16
;   m5  - extra reference words; its top pair (words 6,7) is shifted into the
;         pair window by each "palignr m3, m5, 12"
; Every vector is pmaddwd(pairs, weights), + pd_16, >> 5, then packed to words;
; with ang_table entry f = (32 - f, f) this is ((32-f)*a + f*b + 16) >> 5
; (assuming pd_16 holds dwords of 16).
; Writes m0-m7 and r5.
6882cglobal ang16_mode_12_24
6883 test r3d, r3d
6884 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
6885 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6886
; m3 / m0 = low / high halves of the interleaved neighbouring-sample pairs
6887 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
6888 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
6889
6890 mova m4, m3
6891 pmaddwd m4, [r6 + 11 * 16] ; [27]
6892 paddd m4, [pd_16]
6893 psrld m4, 5
6894 mova m2, m0
6895 pmaddwd m2, [r6 + 11 * 16]
6896 paddd m2, [pd_16]
6897 psrld m2, 5
6898 packusdw m4, m2
6899
6900 mova m2, m3
6901 pmaddwd m2, [r6 + 6 * 16] ; [22]
6902 paddd m2, [pd_16]
6903 psrld m2, 5
6904 mova m1, m0
6905 pmaddwd m1, [r6 + 6 * 16]
6906 paddd m1, [pd_16]
6907 psrld m1, 5
6908 packusdw m2, m1
6909
6910 mova m6, m3
6911 pmaddwd m6, [r6 + 1 *16] ; [17]
6912 paddd m6, [pd_16]
6913 psrld m6, 5
6914 mova m1, m0
6915 pmaddwd m1, [r6 + 1 * 16]
6916 paddd m1, [pd_16]
6917 psrld m1, 5
6918 packusdw m6, m1
6919
6920 mova m7, m3
6921 pmaddwd m7, [r6 - 4 * 16] ; [12]
6922 paddd m7, [pd_16]
6923 psrld m7, 5
6924 mova m1, m0
6925 pmaddwd m1, [r6 - 4 * 16]
6926 paddd m1, [pd_16]
6927 psrld m1, 5
6928 packusdw m7, m1
6929
6930 mov r5, r0
6931
; vectors 0-3
6932 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6933
6934 mova m4, m3
6935 pmaddwd m4, [r6 - 9 * 16] ; [7]
6936 paddd m4, [pd_16]
6937 psrld m4, 5
6938 mova m1, m0
6939 pmaddwd m1, [r6 - 9 * 16]
6940 paddd m1, [pd_16]
6941 psrld m1, 5
6942 packusdw m4, m1
6943
6944 mova m2, m3
6945 pmaddwd m2, [r6 - 14 * 16] ; [2]
6946 paddd m2, [pd_16]
6947 psrld m2, 5
6948 mova m1, m0
6949 pmaddwd m1, [r6 - 14 * 16]
6950 paddd m1, [pd_16]
6951 psrld m1, 5
6952 packusdw m2, m1
6953
; slide the pair window by one pair: m0 takes m3's top pair, m3 takes m5's top pair
6954 palignr m0, m3, 12
6955 palignr m3, m5, 12
6956
6957 mova m6, m3
6958 pmaddwd m6, [r6 + 13 * 16] ; [29]
6959 paddd m6, [pd_16]
6960 psrld m6, 5
6961 mova m7, m0
6962 pmaddwd m7, [r6 + 13 * 16]
6963 paddd m7, [pd_16]
6964 psrld m7, 5
6965 packusdw m6, m7
6966
6967 mova m7, m3
6968 pmaddwd m7, [r6 + 8 * 16] ; [24]
6969 paddd m7, [pd_16]
6970 psrld m7, 5
6971 mova m1, m0
6972 pmaddwd m1, [r6 + 8 * 16]
6973 paddd m1, [pd_16]
6974 psrld m1, 5
6975 packusdw m7, m1
6976
6977 lea r5, [r0 + r1 * 4]
6978
; vectors 4-7
6979 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6980
6981 mova m4, m3
6982 pmaddwd m4, [r6 + 3 *16] ; [19]
6983 paddd m4, [pd_16]
6984 psrld m4, 5
6985 mova m6, m0
6986 pmaddwd m6, [r6 + 3 * 16]
6987 paddd m6, [pd_16]
6988 psrld m6, 5
6989 packusdw m4, m6
6990
6991 mova m2, m3
6992 pmaddwd m2, [r6 - 2 * 16] ; [14]
6993 paddd m2, [pd_16]
6994 psrld m2, 5
6995 mova m6, m0
6996 pmaddwd m6, [r6 - 2 * 16]
6997 paddd m6, [pd_16]
6998 psrld m6, 5
6999 packusdw m2, m6
7000
7001 mova m6, m3
7002 pmaddwd m6, [r6 - 7 * 16] ; [9]
7003 paddd m6, [pd_16]
7004 psrld m6, 5
7005 mova m1, m0
7006 pmaddwd m1, [r6 - 7 * 16]
7007 paddd m1, [pd_16]
7008 psrld m1, 5
7009 packusdw m6, m1
7010
7011 mova m7, m3
7012 pmaddwd m7, [r6 - 12 * 16] ; [4]
7013 paddd m7, [pd_16]
7014 psrld m7, 5
7015 mova m1, m0
7016 pmaddwd m1, [r6 - 12 * 16]
7017 paddd m1, [pd_16]
7018 psrld m1, 5
7019 packusdw m7, m1
7020
7021 lea r5, [r5 + r1 * 4]
7022
; vectors 8-11
7023 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7024
; advance m5 by one word so its top pair is the next overlapping pair, then slide
7025 pslldq m5, 2
7026 palignr m0, m3, 12
7027 palignr m3, m5, 12
7028
7029 mova m4, m3
7030 pmaddwd m4, [r6 + 15 * 16] ; [31]
7031 paddd m4, [pd_16]
7032 psrld m4, 5
7033 mova m1, m0
7034 pmaddwd m1, [r6 + 15 * 16]
7035 paddd m1, [pd_16]
7036 psrld m1, 5
7037 packusdw m4, m1
7038
7039 mova m2, m3
7040 pmaddwd m2, [r6 + 10 * 16] ; [26]
7041 paddd m2, [pd_16]
7042 psrld m2, 5
7043 mova m1, m0
7044 pmaddwd m1, [r6 + 10 * 16]
7045 paddd m1, [pd_16]
7046 psrld m1, 5
7047 packusdw m2, m1
7048
7049 mova m7, m3
7050 pmaddwd m7, [r6 + 5 * 16] ; [21]
7051 paddd m7, [pd_16]
7052 psrld m7, 5
7053 mova m1, m0
7054 pmaddwd m1, [r6 + 5 * 16]
7055 paddd m1, [pd_16]
7056 psrld m1, 5
7057 packusdw m7, m1
7058
; last vector is computed in place: m3 / m0 are not needed afterwards
7059 pmaddwd m3, [r6] ; [16]
7060 paddd m3, [pd_16]
7061 psrld m3, 5
7062 pmaddwd m0, [r6]
7063 paddd m0, [pd_16]
7064 psrld m0, 5
7065 packusdw m3, m0
7066
7067 lea r5, [r5 + r1 * 4]
7068
; vectors 12-15
7069 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
7070
7071 ret
7072
; intra_pred_ang16_12: 16x16 angular intra prediction, mode 12 (16-bit samples).
; Calls ang16_mode_12_24 twice with selector r3d = 0.
;   r0 - output pointer (presumably dst; advanced 8 * r1 bytes between the calls)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - main reference read by the kernel
;   r3 - side reference; used here only to build m5, then reused as the selector
;   r4 - 3 * r1, r6 - ang_table entry 16 (both prepared for the kernel)
7073cglobal intra_pred_ang16_12, 4,7,8
7074 add r1, r1
7075 lea r4, [r1 * 3]
7076 lea r6, [ang_table + 16 * 16]
; m5 = side-reference words rearranged by pw_ang8_12 (table defined elsewhere),
; with the word at [r3 + 26] inserted into word slot 5
7077 movu m5, [r3]
7078 pshufb m5, [pw_ang8_12]
7079 pinsrw m5, [r3 + 26], 5
; selector = 0
7080 xor r3d, r3d
7081
7082 call ang16_mode_12_24
7083
; second half: output down by 8 * r1 bytes; m5 now comes from the main reference
; itself ([r2 + 2], loaded before r2 is advanced by 16 bytes)
7084 lea r0, [r0 + r1 * 8]
7085 movu m5, [r2 + 2]
7086 lea r2, [r2 + 16]
7087
7088 call ang16_mode_12_24
7089
7090 RET
7091
; intra_pred_ang16_24: 16x16 angular intra prediction, mode 24 (16-bit samples).
; Same as mode 12 but with the two reference pointers exchanged, selector
; r3d = 1, and the output advanced by 16 bytes (not 8 * r1) between the calls.
;   r0 - output pointer (presumably dst)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 / r3 - reference pointers, swapped on entry so the kernel reads the other one
;   r4 - 3 * r1, r6 - ang_table entry 16 (both prepared for the kernel)
7092cglobal intra_pred_ang16_24, 4,7,8
7093 xchg r2, r3
7094 add r1, r1
7095 lea r4, [r1 * 3]
7096 lea r6, [ang_table + 16 * 16]
; m5 = side-reference words rearranged by pw_ang8_12 (table defined elsewhere),
; with the word at [r3 + 26] inserted into word slot 5
7097 movu m5, [r3]
7098 pshufb m5, [pw_ang8_12]
7099 pinsrw m5, [r3 + 26], 5
; selector = 1
7100 xor r3d, r3d
7101 inc r3d
7102
7103 call ang16_mode_12_24
7104
; second half: output right by 16 bytes; m5 now comes from the main reference
; itself ([r2 + 2], loaded before r2 is advanced by 16 bytes)
7105 lea r0, [r0 + 16]
7106 movu m5, [r2 + 2]
7107 lea r2, [r2 + 16]
7108
7109 call ang16_mode_12_24
7110
7111 RET
7112
; ang16_mode_13_23: shared kernel for angular modes 13 and 23; produces sixteen
; 8-word interpolated vectors and hands them to TRANSPOSE_STORE four at a time.
; Inputs (set up by the callers):
;   r0  - output base (copied to r5 before each store group)
;   r1  - row step in bytes (r5 is advanced by r1 * 4 between store groups)
;   r2  - main reference: 9 consecutive words are loaded from [r2] / [r2 + 2]
;   r3d - selector; tested once at entry. No instruction in this block reads the
;         flags afterwards, so presumably TRANSPOSE_STORE (defined elsewhere)
;         branches on them - TODO confirm
;   r6  - points into ang_table; the bracketed comments give the weight index
;         assuming r6 = ang_table + 15 * 16
;   m5  - extra reference words; its top pair (words 6,7) is shifted into the
;         pair window by each "palignr m3, m5, 12", and "pslldq m5, 2" moves
;         the next word into place
; Every vector is pmaddwd(pairs, weights), + pd_16, >> 5, then packed to words;
; with ang_table entry f = (32 - f, f) this is ((32-f)*a + f*b + 16) >> 5
; (assuming pd_16 holds dwords of 16).
; Writes m0-m7 and r5.
7113cglobal ang16_mode_13_23
7114 test r3d, r3d
7115 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7116 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7117
; m3 / m0 = low / high halves of the interleaved neighbouring-sample pairs
7118 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7119 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7120
7121 mova m4, m3
7122 pmaddwd m4, [r6 + 8 * 16] ; [23]
7123 paddd m4, [pd_16]
7124 psrld m4, 5
7125 mova m2, m0
7126 pmaddwd m2, [r6 + 8 * 16]
7127 paddd m2, [pd_16]
7128 psrld m2, 5
7129 packusdw m4, m2
7130
7131 mova m2, m3
7132 pmaddwd m2, [r6 - 1 * 16] ; [14]
7133 paddd m2, [pd_16]
7134 psrld m2, 5
7135 mova m1, m0
7136 pmaddwd m1, [r6 - 1 * 16]
7137 paddd m1, [pd_16]
7138 psrld m1, 5
7139 packusdw m2, m1
7140
7141 mova m6, m3
7142 pmaddwd m6, [r6 - 10 *16] ; [5]
7143 paddd m6, [pd_16]
7144 psrld m6, 5
7145 mova m1, m0
7146 pmaddwd m1, [r6 - 10 * 16]
7147 paddd m1, [pd_16]
7148 psrld m1, 5
7149 packusdw m6, m1
7150
; slide the pair window by one pair: m0 takes m3's top pair, m3 takes m5's top pair
7151 palignr m0, m3, 12
7152 palignr m3, m5, 12
7153
7154 mova m7, m3
7155 pmaddwd m7, [r6 + 13 * 16] ; [28]
7156 paddd m7, [pd_16]
7157 psrld m7, 5
7158 mova m1, m0
7159 pmaddwd m1, [r6 + 13 * 16]
7160 paddd m1, [pd_16]
7161 psrld m1, 5
7162 packusdw m7, m1
7163
7164 mov r5, r0
7165
; vectors 0-3
7166 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7167
7168 mova m4, m3
7169 pmaddwd m4, [r6 + 4 * 16] ; [19]
7170 paddd m4, [pd_16]
7171 psrld m4, 5
7172 mova m1, m0
7173 pmaddwd m1, [r6 + 4 * 16]
7174 paddd m1, [pd_16]
7175 psrld m1, 5
7176 packusdw m4, m1
7177
7178 mova m2, m3
7179 pmaddwd m2, [r6 - 5 * 16] ; [10]
7180 paddd m2, [pd_16]
7181 psrld m2, 5
7182 mova m1, m0
7183 pmaddwd m1, [r6 - 5 * 16]
7184 paddd m1, [pd_16]
7185 psrld m1, 5
7186 packusdw m2, m1
7187
7188 mova m6, m3
7189 pmaddwd m6, [r6 - 14 * 16] ; [1]
7190 paddd m6, [pd_16]
7191 psrld m6, 5
7192 mova m7, m0
7193 pmaddwd m7, [r6 - 14 * 16]
7194 paddd m7, [pd_16]
7195 psrld m7, 5
7196 packusdw m6, m7
7197
; advance m5 by one word, then slide the window by one pair
7198 pslldq m5, 2
7199 palignr m0, m3, 12
7200 palignr m3, m5, 12
7201
7202 mova m7, m3
7203 pmaddwd m7, [r6 + 9 * 16] ; [24]
7204 paddd m7, [pd_16]
7205 psrld m7, 5
7206 mova m1, m0
7207 pmaddwd m1, [r6 + 9 * 16]
7208 paddd m1, [pd_16]
7209 psrld m1, 5
7210 packusdw m7, m1
7211
7212 lea r5, [r0 + r1 * 4]
7213
; vectors 4-7
7214 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7215
7216 mova m4, m3
7217 pmaddwd m4, [r6] ; [15]
7218 paddd m4, [pd_16]
7219 psrld m4, 5
7220 mova m6, m0
7221 pmaddwd m6, [r6]
7222 paddd m6, [pd_16]
7223 psrld m6, 5
7224 packusdw m4, m6
7225
7226 mova m2, m3
7227 pmaddwd m2, [r6 - 9 * 16] ; [6]
7228 paddd m2, [pd_16]
7229 psrld m2, 5
7230 mova m6, m0
7231 pmaddwd m6, [r6 - 9 * 16]
7232 paddd m6, [pd_16]
7233 psrld m6, 5
7234 packusdw m2, m6
7235
; advance m5 by one word, then slide the window by one pair
7236 pslldq m5, 2
7237 palignr m0, m3, 12
7238 palignr m3, m5, 12
7239
7240 mova m6, m3
7241 pmaddwd m6, [r6 + 14 * 16] ; [29]
7242 paddd m6, [pd_16]
7243 psrld m6, 5
7244 mova m1, m0
7245 pmaddwd m1, [r6 + 14 * 16]
7246 paddd m1, [pd_16]
7247 psrld m1, 5
7248 packusdw m6, m1
7249
7250 mova m7, m3
7251 pmaddwd m7, [r6 + 5 * 16] ; [20]
7252 paddd m7, [pd_16]
7253 psrld m7, 5
7254 mova m1, m0
7255 pmaddwd m1, [r6 + 5 * 16]
7256 paddd m1, [pd_16]
7257 psrld m1, 5
7258 packusdw m7, m1
7259
7260 lea r5, [r5 + r1 * 4]
7261
; vectors 8-11
7262 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7263
7264 mova m4, m3
7265 pmaddwd m4, [r6 - 4 * 16] ; [11]
7266 paddd m4, [pd_16]
7267 psrld m4, 5
7268 mova m1, m0
7269 pmaddwd m1, [r6 - 4 * 16]
7270 paddd m1, [pd_16]
7271 psrld m1, 5
7272 packusdw m4, m1
7273
7274 mova m2, m3
7275 pmaddwd m2, [r6 - 13 * 16] ; [2]
7276 paddd m2, [pd_16]
7277 psrld m2, 5
7278 mova m1, m0
7279 pmaddwd m1, [r6 - 13 * 16]
7280 paddd m1, [pd_16]
7281 psrld m1, 5
7282 packusdw m2, m1
7283
; advance m5 by one word, then slide the window by one pair
7284 pslldq m5, 2
7285 palignr m0, m3, 12
7286 palignr m3, m5, 12
7287
7288 mova m7, m3
7289 pmaddwd m7, [r6 + 10 * 16] ; [25]
7290 paddd m7, [pd_16]
7291 psrld m7, 5
7292 mova m1, m0
7293 pmaddwd m1, [r6 + 10 * 16]
7294 paddd m1, [pd_16]
7295 psrld m1, 5
7296 packusdw m7, m1
7297
; last vector is computed in place: m3 / m0 are not needed afterwards
7298 pmaddwd m3, [r6 + 1 * 16] ; [16]
7299 paddd m3, [pd_16]
7300 psrld m3, 5
7301 pmaddwd m0, [r6 + 1 *16]
7302 paddd m0, [pd_16]
7303 psrld m0, 5
7304 packusdw m3, m0
7305
7306 lea r5, [r5 + r1 * 4]
7307
; vectors 12-15
7308 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
7309
7310 ret
7311
; intra_pred_ang16_13: 16x16 angular intra prediction, mode 13 (16-bit samples).
; Calls ang16_mode_13_23 twice with selector r3d = 0.
;   r0 - output pointer (presumably dst; advanced 8 * r1 bytes between the calls)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - main reference read by the kernel
;   r3 - side reference; used here only to build m5, then reused as the selector
;   r4 - 3 * r1, r6 - ang_table entry 15 (both prepared for the kernel)
7312cglobal intra_pred_ang16_13, 4,7,8
7313 add r1, r1
7314 lea r4, [r1 * 3]
7315 lea r6, [ang_table + 15 * 16]
; build m5 from two shuffled loads of the side reference (shuffle tables are
; defined elsewhere): m6 is shifted up one word and its top 10 bytes become the
; low bytes of m5, followed by the low 6 bytes of the shuffled [r3] load
7316 movu m5, [r3]
7317 pshufb m5, [pw_ang16_13]
7318 movu m6, [r3 + 14]
7319 pshufb m6, [pw_ang8_13]
7320 pslldq m6, 2
7321 palignr m5, m6, 6
; selector = 0
7322 xor r3d, r3d
7323
7324 call ang16_mode_13_23
7325
; second half: output down by 8 * r1 bytes; m5 now comes from the main reference
; itself ([r2 + 2], loaded before r2 is advanced by 16 bytes)
7326 lea r0, [r0 + r1 * 8]
7327 movu m5, [r2 + 2]
7328 lea r2, [r2 + 16]
7329
7330 call ang16_mode_13_23
7331
7332 RET
7333
; intra_pred_ang16_23: 16x16 angular intra prediction, mode 23 (16-bit samples).
; Same as mode 13 but with the two reference pointers exchanged, selector
; r3d = 1, and the output advanced by 16 bytes (not 8 * r1) between the calls.
;   r0 - output pointer (presumably dst)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 / r3 - reference pointers, swapped on entry so the kernel reads the other one
;   r4 - 3 * r1, r6 - ang_table entry 15 (both prepared for the kernel)
7334cglobal intra_pred_ang16_23, 4,7,8
7335 xchg r2, r3
7336 add r1, r1
7337 lea r4, [r1 * 3]
7338 lea r6, [ang_table + 15 * 16]
; build m5 from two shuffled loads of the side reference (shuffle tables are
; defined elsewhere): m6 is shifted up one word and its top 10 bytes become the
; low bytes of m5, followed by the low 6 bytes of the shuffled [r3] load
7339 movu m5, [r3]
7340 pshufb m5, [pw_ang16_13]
7341 movu m6, [r3 + 14]
7342 pshufb m6, [pw_ang8_13]
7343 pslldq m6, 2
7344 palignr m5, m6, 6
; selector = 1
7345 xor r3d, r3d
7346 inc r3d
7347
7348 call ang16_mode_13_23
7349
; second half: output right by 16 bytes; m5 now comes from the main reference
; itself ([r2 + 2], loaded before r2 is advanced by 16 bytes)
7350 lea r0, [r0 + 16]
7351 movu m5, [r2 + 2]
7352 lea r2, [r2 + 16]
7353
7354 call ang16_mode_13_23
7355
7356 RET
7357
; ang16_mode_14_22: shared kernel for angular modes 14 and 22; produces sixteen
; 8-word interpolated vectors and hands them to TRANSPOSE_STORE four at a time.
; Inputs (set up by the callers):
;   r0  - output base (copied to r5 before each store group)
;   r1  - row step in bytes (r5 is advanced by r1 * 4 between store groups)
;   r2  - main reference: 9 consecutive words are loaded from [r2] / [r2 + 2]
;   r3d - selector; tested once at entry. No instruction in this block reads the
;         flags afterwards, so presumably TRANSPOSE_STORE (defined elsewhere)
;         branches on them - TODO confirm
;   r6  - points into ang_table; the bracketed comments give the weight index
;         assuming r6 = ang_table + 18 * 16
;   m5  - extra reference words; its top pair (words 6,7) is shifted into the
;         pair window by each "palignr m3, m5, 12", and "pslldq m5, 2" moves
;         the next word into place
; Every vector is pmaddwd(pairs, weights), + pd_16, >> 5, then packed to words;
; with ang_table entry f = (32 - f, f) this is ((32-f)*a + f*b + 16) >> 5
; (assuming pd_16 holds dwords of 16).
; Writes m0-m7 and r5.
7358cglobal ang16_mode_14_22
7359 test r3d, r3d
7360 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7361 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7362
; m3 / m0 = low / high halves of the interleaved neighbouring-sample pairs
7363 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7364 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7365
7366 mova m4, m3
7367 pmaddwd m4, [r6 + 1 * 16] ; [19]
7368 paddd m4, [pd_16]
7369 psrld m4, 5
7370 mova m2, m0
7371 pmaddwd m2, [r6 + 1 * 16]
7372 paddd m2, [pd_16]
7373 psrld m2, 5
7374 packusdw m4, m2
7375
7376 mova m2, m3
7377 pmaddwd m2, [r6 - 12 * 16] ; [6]
7378 paddd m2, [pd_16]
7379 psrld m2, 5
7380 mova m1, m0
7381 pmaddwd m1, [r6 - 12 * 16]
7382 paddd m1, [pd_16]
7383 psrld m1, 5
7384 packusdw m2, m1
7385
; slide the pair window by one pair: m0 takes m3's top pair, m3 takes m5's top pair
7386 palignr m0, m3, 12
7387 palignr m3, m5, 12
7388
7389 mova m6, m3
7390 pmaddwd m6, [r6 + 7 * 16] ; [25]
7391 paddd m6, [pd_16]
7392 psrld m6, 5
7393 mova m1, m0
7394 pmaddwd m1, [r6 + 7 * 16]
7395 paddd m1, [pd_16]
7396 psrld m1, 5
7397 packusdw m6, m1
7398
7399 mova m7, m3
7400 pmaddwd m7, [r6 - 6 * 16] ; [12]
7401 paddd m7, [pd_16]
7402 psrld m7, 5
7403 mova m1, m0
7404 pmaddwd m1, [r6 - 6 * 16]
7405 paddd m1, [pd_16]
7406 psrld m1, 5
7407 packusdw m7, m1
7408
7409 mov r5, r0
7410
; vectors 0-3
7411 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7412
; advance m5 by one word, then slide the window by one pair
7413 pslldq m5, 2
7414 palignr m0, m3, 12
7415 palignr m3, m5, 12
7416
7417 mova m4, m3
7418 pmaddwd m4, [r6 + 13 * 16] ; [31]
7419 paddd m4, [pd_16]
7420 psrld m4, 5
7421 mova m1, m0
7422 pmaddwd m1, [r6 + 13 * 16]
7423 paddd m1, [pd_16]
7424 psrld m1, 5
7425 packusdw m4, m1
7426
7427 mova m2, m3
7428 pmaddwd m2, [r6] ; [18]
7429 paddd m2, [pd_16]
7430 psrld m2, 5
7431 mova m1, m0
7432 pmaddwd m1, [r6]
7433 paddd m1, [pd_16]
7434 psrld m1, 5
7435 packusdw m2, m1
7436
7437 mova m6, m3
7438 pmaddwd m6, [r6 - 13 * 16] ; [5]
7439 paddd m6, [pd_16]
7440 psrld m6, 5
7441 mova m7, m0
7442 pmaddwd m7, [r6 - 13 * 16]
7443 paddd m7, [pd_16]
7444 psrld m7, 5
7445 packusdw m6, m7
7446
; advance m5 by one word, then slide the window by one pair
7447 pslldq m5, 2
7448 palignr m0, m3, 12
7449 palignr m3, m5, 12
7450
7451 mova m7, m3
7452 pmaddwd m7, [r6 + 6 * 16] ; [24]
7453 paddd m7, [pd_16]
7454 psrld m7, 5
7455 mova m1, m0
7456 pmaddwd m1, [r6 + 6 * 16]
7457 paddd m1, [pd_16]
7458 psrld m1, 5
7459 packusdw m7, m1
7460
7461 lea r5, [r0 + r1 * 4]
7462
; vectors 4-7
7463 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7464
7465 mova m4, m3
7466 pmaddwd m4, [r6 - 7 * 16] ; [11]
7467 paddd m4, [pd_16]
7468 psrld m4, 5
7469 mova m6, m0
7470 pmaddwd m6, [r6 - 7 * 16]
7471 paddd m6, [pd_16]
7472 psrld m6, 5
7473 packusdw m4, m6
7474
; advance m5 by one word, then slide the window by one pair
7475 pslldq m5, 2
7476 palignr m0, m3, 12
7477 palignr m3, m5, 12
7478
7479 mova m2, m3
7480 pmaddwd m2, [r6 + 12 * 16] ; [30]
7481 paddd m2, [pd_16]
7482 psrld m2, 5
7483 mova m6, m0
7484 pmaddwd m6, [r6 + 12 * 16]
7485 paddd m6, [pd_16]
7486 psrld m6, 5
7487 packusdw m2, m6
7488
7489 mova m6, m3
7490 pmaddwd m6, [r6 - 1 * 16] ; [17]
7491 paddd m6, [pd_16]
7492 psrld m6, 5
7493 mova m1, m0
7494 pmaddwd m1, [r6 - 1 * 16]
7495 paddd m1, [pd_16]
7496 psrld m1, 5
7497 packusdw m6, m1
7498
7499 mova m7, m3
7500 pmaddwd m7, [r6 - 14 * 16] ; [4]
7501 paddd m7, [pd_16]
7502 psrld m7, 5
7503 mova m1, m0
7504 pmaddwd m1, [r6 - 14 * 16]
7505 paddd m1, [pd_16]
7506 psrld m1, 5
7507 packusdw m7, m1
7508
7509 lea r5, [r5 + r1 * 4]
7510
; vectors 8-11
7511 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7512
; advance m5 by one word, then slide the window by one pair
7513 pslldq m5, 2
7514 palignr m0, m3, 12
7515 palignr m3, m5, 12
7516
7517 mova m4, m3
7518 pmaddwd m4, [r6 + 5 * 16] ; [23]
7519 paddd m4, [pd_16]
7520 psrld m4, 5
7521 mova m1, m0
7522 pmaddwd m1, [r6 + 5 * 16]
7523 paddd m1, [pd_16]
7524 psrld m1, 5
7525 packusdw m4, m1
7526
7527 mova m2, m3
7528 pmaddwd m2, [r6 - 8 * 16] ; [10]
7529 paddd m2, [pd_16]
7530 psrld m2, 5
7531 mova m1, m0
7532 pmaddwd m1, [r6 - 8 * 16]
7533 paddd m1, [pd_16]
7534 psrld m1, 5
7535 packusdw m2, m1
7536
; advance m5 by one word, then slide the window by one pair
7537 pslldq m5, 2
7538 palignr m0, m3, 12
7539 palignr m3, m5, 12
7540
7541 mova m7, m3
7542 pmaddwd m7, [r6 + 11 * 16] ; [29]
7543 paddd m7, [pd_16]
7544 psrld m7, 5
7545 mova m1, m0
7546 pmaddwd m1, [r6 + 11 * 16]
7547 paddd m1, [pd_16]
7548 psrld m1, 5
7549 packusdw m7, m1
7550
; last vector is computed in place: m3 / m0 are not needed afterwards
7551 pmaddwd m3, [r6 - 2 * 16] ; [16]
7552 paddd m3, [pd_16]
7553 psrld m3, 5
7554 pmaddwd m0, [r6 - 2 *16]
7555 paddd m0, [pd_16]
7556 psrld m0, 5
7557 packusdw m3, m0
7558
7559 lea r5, [r5 + r1 * 4]
7560
; vectors 12-15
7561 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
7562
7563 ret
7564
; intra_pred_ang16_14: 16x16 angular intra prediction, mode 14 (16-bit samples).
; Calls ang16_mode_14_22 twice with selector r3d = 0.
;   r0 - output pointer (presumably dst; advanced 8 * r1 bytes between the calls)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - main reference read by the kernel
;   r3 - side reference; used here only to build m5, then reused as the selector
;   r4 - 3 * r1, r6 - ang_table entry 18 (both prepared for the kernel)
7565cglobal intra_pred_ang16_14, 4,7,8
7566 add r1, r1
7567 lea r4, [r1 * 3]
7568 lea r6, [ang_table + 18 * 16]
; m5 = high qword of shuffled [r3 + 20] (low half) : high qword of shuffled [r3]
; (high half); pw_ang8_14 is a shuffle table defined elsewhere
7569 movu m6, [r3]
7570 pshufb m6, [pw_ang8_14]
7571 movu m5, [r3 + 20]
7572 pshufb m5, [pw_ang8_14]
7573 punpckhqdq m5, m6
; selector = 0
7574 xor r3d, r3d
7575
7576 call ang16_mode_14_22
7577
; second half: output down by 8 * r1 bytes; m5 now comes from the main reference
; itself ([r2 + 2], loaded before r2 is advanced by 16 bytes)
7578 lea r0, [r0 + r1 * 8]
7579 movu m5, [r2 + 2]
7580 lea r2, [r2 + 16]
7581
7582 call ang16_mode_14_22
7583
7584 RET
7585
; intra_pred_ang16_22: 16x16 angular intra prediction, mode 22 (16-bit samples).
; Same as mode 14 but with the two reference pointers exchanged, selector
; r3d = 1, and the output advanced by 16 bytes (not 8 * r1) between the calls.
;   r0 - output pointer (presumably dst)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 / r3 - reference pointers, swapped on entry so the kernel reads the other one
;   r4 - 3 * r1, r6 - ang_table entry 18 (both prepared for the kernel)
7586cglobal intra_pred_ang16_22, 4,7,8
7587 xchg r2, r3
7588 add r1, r1
7589 lea r4, [r1 * 3]
7590 lea r6, [ang_table + 18 * 16]
; m5 = high qword of shuffled [r3 + 20] (low half) : high qword of shuffled [r3]
; (high half); pw_ang8_14 is a shuffle table defined elsewhere
7591 movu m6, [r3]
7592 pshufb m6, [pw_ang8_14]
7593 movu m5, [r3 + 20]
7594 pshufb m5, [pw_ang8_14]
7595 punpckhqdq m5, m6
; selector = 1
7596 xor r3d, r3d
7597 inc r3d
7598
7599 call ang16_mode_14_22
7600
; second half: output right by 16 bytes; m5 now comes from the main reference
; itself ([r2 + 2], loaded before r2 is advanced by 16 bytes)
7601 lea r0, [r0 + 16]
7602 movu m5, [r2 + 2]
7603 lea r2, [r2 + 16]
7604
7605 call ang16_mode_14_22
7606
7607 RET
7608
; ang16_mode_15_21: shared kernel for angular modes 15 and 21; produces sixteen
; 8-word interpolated vectors and hands them to TRANSPOSE_STORE four at a time.
; Inputs (set up by the callers):
;   r0  - output base (copied to r5 before each store group)
;   r1  - row step in bytes (r5 is advanced by r1 * 4 between store groups)
;   r2  - main reference: 9 consecutive words are loaded from [r2] / [r2 + 2]
;   r3d - selector; tested once at entry. No instruction in this block reads the
;         flags afterwards, so presumably TRANSPOSE_STORE (defined elsewhere)
;         branches on them - TODO confirm
;   r6  - points into ang_table; the bracketed comments give the weight index
;         assuming r6 = ang_table + 15 * 16
;   m5  - extra reference words; its top pair (words 6,7) is shifted into the
;         pair window by each "palignr m3, m5, 12", and "pslldq m5, 2" moves
;         the next word into place
; Every vector is pmaddwd(pairs, weights), + pd_16, >> 5, then packed to words;
; with ang_table entry f = (32 - f, f) this is ((32-f)*a + f*b + 16) >> 5
; (assuming pd_16 holds dwords of 16).
; Writes m0-m7 and r5.
7609cglobal ang16_mode_15_21
7610 test r3d, r3d
7611 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7612 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7613
; m6 = m5 shifted down one word with m0's first word on top, so its top pair
; is (m5 word 7, m0 word 0): the bridge between m5 and the main reference
7614 palignr m6, m0, m5, 2
7615
; m3 / m0 = low / high halves of the interleaved neighbouring-sample pairs
7616 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7617 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7618
7619 mova m4, m3
7620 pmaddwd m4, [r6] ; [15]
7621 paddd m4, [pd_16]
7622 psrld m4, 5
7623 mova m2, m0
7624 pmaddwd m2, [r6]
7625 paddd m2, [pd_16]
7626 psrld m2, 5
7627 packusdw m4, m2
7628
; first slide takes the bridging pair from m6
7629 palignr m0, m3, 12
7630 palignr m3, m6, 12
7631
7632 mova m2, m3
7633 pmaddwd m2, [r6 + 15 * 16] ; [30]
7634 paddd m2, [pd_16]
7635 psrld m2, 5
7636 mova m1, m0
7637 pmaddwd m1, [r6 + 15 * 16]
7638 paddd m1, [pd_16]
7639 psrld m1, 5
7640 packusdw m2, m1
7641
7642 mova m6, m3
7643 pmaddwd m6, [r6 - 2 * 16] ; [13]
7644 paddd m6, [pd_16]
7645 psrld m6, 5
7646 mova m1, m0
7647 pmaddwd m1, [r6 - 2 * 16]
7648 paddd m1, [pd_16]
7649 psrld m1, 5
7650 packusdw m6, m1
7651
; later slides take m5's top pair directly
7652 palignr m0, m3, 12
7653 palignr m3, m5, 12
7654
7655 mova m7, m3
7656 pmaddwd m7, [r6 + 13 * 16] ; [28]
7657 paddd m7, [pd_16]
7658 psrld m7, 5
7659 mova m1, m0
7660 pmaddwd m1, [r6 + 13 * 16]
7661 paddd m1, [pd_16]
7662 psrld m1, 5
7663 packusdw m7, m1
7664
7665 mov r5, r0
7666
; vectors 0-3
7667 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7668
7669 mova m4, m3
7670 pmaddwd m4, [r6 - 4 * 16] ; [11]
7671 paddd m4, [pd_16]
7672 psrld m4, 5
7673 mova m1, m0
7674 pmaddwd m1, [r6 - 4 * 16]
7675 paddd m1, [pd_16]
7676 psrld m1, 5
7677 packusdw m4, m1
7678
; advance m5 by one word, then slide the window by one pair
7679 pslldq m5, 2
7680 palignr m0, m3, 12
7681 palignr m3, m5, 12
7682
7683 mova m2, m3
7684 pmaddwd m2, [r6 + 11 * 16] ; [26]
7685 paddd m2, [pd_16]
7686 psrld m2, 5
7687 mova m1, m0
7688 pmaddwd m1, [r6 + 11 * 16]
7689 paddd m1, [pd_16]
7690 psrld m1, 5
7691 packusdw m2, m1
7692
7693 mova m6, m3
7694 pmaddwd m6, [r6 - 6 * 16] ; [9]
7695 paddd m6, [pd_16]
7696 psrld m6, 5
7697 mova m7, m0
7698 pmaddwd m7, [r6 - 6 * 16]
7699 paddd m7, [pd_16]
7700 psrld m7, 5
7701 packusdw m6, m7
7702
; advance m5 by one word, then slide the window by one pair
7703 pslldq m5, 2
7704 palignr m0, m3, 12
7705 palignr m3, m5, 12
7706
7707 mova m7, m3
7708 pmaddwd m7, [r6 + 9 * 16] ; [24]
7709 paddd m7, [pd_16]
7710 psrld m7, 5
7711 mova m1, m0
7712 pmaddwd m1, [r6 + 9 * 16]
7713 paddd m1, [pd_16]
7714 psrld m1, 5
7715 packusdw m7, m1
7716
7717 lea r5, [r0 + r1 * 4]
7718
; vectors 4-7
7719 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7720
7721 mova m4, m3
7722 pmaddwd m4, [r6 - 8 * 16] ; [7]
7723 paddd m4, [pd_16]
7724 psrld m4, 5
7725 mova m6, m0
7726 pmaddwd m6, [r6 - 8 * 16]
7727 paddd m6, [pd_16]
7728 psrld m6, 5
7729 packusdw m4, m6
7730
; advance m5 by one word, then slide the window by one pair
7731 pslldq m5, 2
7732 palignr m0, m3, 12
7733 palignr m3, m5, 12
7734
7735 mova m2, m3
7736 pmaddwd m2, [r6 + 7 * 16] ; [22]
7737 paddd m2, [pd_16]
7738 psrld m2, 5
7739 mova m6, m0
7740 pmaddwd m6, [r6 + 7 * 16]
7741 paddd m6, [pd_16]
7742 psrld m6, 5
7743 packusdw m2, m6
7744
7745 mova m6, m3
7746 pmaddwd m6, [r6 - 10 * 16] ; [5]
7747 paddd m6, [pd_16]
7748 psrld m6, 5
7749 mova m1, m0
7750 pmaddwd m1, [r6 - 10 * 16]
7751 paddd m1, [pd_16]
7752 psrld m1, 5
7753 packusdw m6, m1
7754
; advance m5 by one word, then slide the window by one pair
7755 pslldq m5, 2
7756 palignr m0, m3, 12
7757 palignr m3, m5, 12
7758
7759 mova m7, m3
7760 pmaddwd m7, [r6 + 5 * 16] ; [20]
7761 paddd m7, [pd_16]
7762 psrld m7, 5
7763 mova m1, m0
7764 pmaddwd m1, [r6 + 5 * 16]
7765 paddd m1, [pd_16]
7766 psrld m1, 5
7767 packusdw m7, m1
7768
7769 lea r5, [r5 + r1 * 4]
7770
; vectors 8-11
7771 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7772
7773 mova m4, m3
7774 pmaddwd m4, [r6 - 12 * 16] ; [3]
7775 paddd m4, [pd_16]
7776 psrld m4, 5
7777 mova m1, m0
7778 pmaddwd m1, [r6 - 12 * 16]
7779 paddd m1, [pd_16]
7780 psrld m1, 5
7781 packusdw m4, m1
7782
; advance m5 by one word, then slide the window by one pair
7783 pslldq m5, 2
7784 palignr m0, m3, 12
7785 palignr m3, m5, 12
7786
7787 mova m2, m3
7788 pmaddwd m2, [r6 + 3 * 16] ; [18]
7789 paddd m2, [pd_16]
7790 psrld m2, 5
7791 mova m1, m0
7792 pmaddwd m1, [r6 + 3 * 16]
7793 paddd m1, [pd_16]
7794 psrld m1, 5
7795 packusdw m2, m1
7796
7797 mova m7, m3
7798 pmaddwd m7, [r6 - 14 * 16] ; [1]
7799 paddd m7, [pd_16]
7800 psrld m7, 5
7801 mova m1, m0
7802 pmaddwd m1, [r6 - 14 * 16]
7803 paddd m1, [pd_16]
7804 psrld m1, 5
7805 packusdw m7, m1
7806
; advance m5 by one word, then slide the window by one pair
7807 pslldq m5, 2
7808 palignr m0, m3, 12
7809 palignr m3, m5, 12
7810
; last vector is computed in place: m3 / m0 are not needed afterwards
7811 pmaddwd m3, [r6 + 1 * 16] ; [16]
7812 paddd m3, [pd_16]
7813 psrld m3, 5
7814 pmaddwd m0, [r6 + 1 * 16]
7815 paddd m0, [pd_16]
7816 psrld m0, 5
7817 packusdw m3, m0
7818
7819 lea r5, [r5 + r1 * 4]
7820
; vectors 12-15
7821 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
7822
7823 ret
7824
; intra_pred_ang16_15: 16x16 angular intra prediction, mode 15 (16-bit samples).
; Calls ang16_mode_15_21 twice with selector r3d = 0.
;   r0 - output pointer (presumably dst; advanced 8 * r1 bytes between the calls)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - main reference read by the kernel
;   r3 - side reference; used here only to build m5, then reused as the selector
;   r4 - 3 * r1, r6 - ang_table entry 15 (both prepared for the kernel)
7825cglobal intra_pred_ang16_15, 4,7,8
7826 add r1, r1
7827 lea r4, [r1 * 3]
7828 lea r6, [ang_table + 15 * 16]
; m5 = high qword of shuffled [r3 + 18] (low half) : high qword of shuffled
; [r3 + 4] (high half); pw_ang8_15 is a shuffle table defined elsewhere
7829 movu m6, [r3 + 4]
7830 pshufb m6, [pw_ang8_15]
7831 movu m5, [r3 + 18]
7832 pshufb m5, [pw_ang8_15]
7833 punpckhqdq m5, m6
; selector = 0
7834 xor r3d, r3d
7835
7836 call ang16_mode_15_21
7837
; second half: output down by 8 * r1 bytes; m5 now comes from the main reference
; itself ([r2], loaded before r2 is advanced by 16 bytes)
7838 lea r0, [r0 + r1 * 8]
7839 movu m5, [r2]
7840 lea r2, [r2 + 16]
7841
7842 call ang16_mode_15_21
7843
7844 RET
7845
; intra_pred_ang16_21: 16x16 angular intra prediction, mode 21 (16-bit samples).
; Same as mode 15 but with the two reference pointers exchanged, selector
; r3d = 1, and the output advanced by 16 bytes (not 8 * r1) between the calls.
;   r0 - output pointer (presumably dst)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 / r3 - reference pointers, swapped on entry so the kernel reads the other one
;   r4 - 3 * r1, r6 - ang_table entry 15 (both prepared for the kernel)
7846cglobal intra_pred_ang16_21, 4,7,8
7847 xchg r2, r3
7848 add r1, r1
7849 lea r4, [r1 * 3]
7850 lea r6, [ang_table + 15 * 16]
; m5 = high qword of shuffled [r3 + 18] (low half) : high qword of shuffled
; [r3 + 4] (high half); pw_ang8_15 is a shuffle table defined elsewhere
7851 movu m6, [r3 + 4]
7852 pshufb m6, [pw_ang8_15]
7853 movu m5, [r3 + 18]
7854 pshufb m5, [pw_ang8_15]
7855 punpckhqdq m5, m6
; selector = 1
7856 xor r3d, r3d
7857 inc r3d
7858
7859 call ang16_mode_15_21
7860
; second half: output right by 16 bytes; m5 now comes from the main reference
; itself ([r2], loaded before r2 is advanced by 16 bytes)
7861 lea r0, [r0 + 16]
7862 movu m5, [r2]
7863 lea r2, [r2 + 16]
7864
7865 call ang16_mode_15_21
7866
7867 RET
7868
; ang16_mode_16_20: shared kernel for angular modes 16 and 20; produces sixteen
; 8-word interpolated vectors and hands them to TRANSPOSE_STORE four at a time.
; Inputs (set up by the callers):
;   r0  - output base (copied to r5 before each store group)
;   r1  - row step in bytes (r5 is advanced by r1 * 4 between store groups)
;   r2  - main reference: 9 consecutive words are loaded from [r2] / [r2 + 2]
;   r3  - pointer from which m5 is refilled near the end (after pw_ang8_16 shuffle)
;   r4d - selector; tested once at entry, then r4 is overwritten with 3 * r1.
;         No instruction in this block reads the flags afterwards, so presumably
;         TRANSPOSE_STORE (defined elsewhere) branches on them - TODO confirm
;   r6  - points into ang_table; the bracketed comments give the weight index
;         assuming r6 = ang_table + 13 * 16
;   m5  - extra reference words; its top pair (words 6,7) is shifted into the
;         pair window by each "palignr m3, m5, 12", and "pslldq m5, 2" moves
;         the next word into place
; Every vector is pmaddwd(pairs, weights), + pd_16, >> 5, then packed to words;
; with ang_table entry f = (32 - f, f) this is ((32-f)*a + f*b + 16) >> 5
; (assuming pd_16 holds dwords of 16).
; Writes m0-m7, r4 and r5.
7869cglobal ang16_mode_16_20
7870 test r4d, r4d
; lea leaves the flags from the test above untouched
7871 lea r4, [r1 * 3]
7872 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7873 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7874
; m6 = m5 shifted down one word with m0's first word on top, so its top pair
; is (m5 word 7, m0 word 0): the bridge between m5 and the main reference
7875 palignr m6, m0, m5, 2
7876
; m3 / m0 = low / high halves of the interleaved neighbouring-sample pairs
7877 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7878 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7879
7880 mova m4, m3
7881 pmaddwd m4, [r6 - 2 * 16] ; [11]
7882 paddd m4, [pd_16]
7883 psrld m4, 5
7884 mova m2, m0
7885 pmaddwd m2, [r6 - 2 * 16]
7886 paddd m2, [pd_16]
7887 psrld m2, 5
7888 packusdw m4, m2
7889
; first slide takes the bridging pair from m6
7890 palignr m0, m3, 12
7891 palignr m3, m6, 12
7892
7893 mova m2, m3
7894 pmaddwd m2, [r6 + 9 * 16] ; [22]
7895 paddd m2, [pd_16]
7896 psrld m2, 5
7897 mova m1, m0
7898 pmaddwd m1, [r6 + 9 * 16]
7899 paddd m1, [pd_16]
7900 psrld m1, 5
7901 packusdw m2, m1
7902
7903 mova m6, m3
7904 pmaddwd m6, [r6 - 12 * 16] ; [1]
7905 paddd m6, [pd_16]
7906 psrld m6, 5
7907 mova m1, m0
7908 pmaddwd m1, [r6 - 12 * 16]
7909 paddd m1, [pd_16]
7910 psrld m1, 5
7911 packusdw m6, m1
7912
; later slides take m5's top pair directly
7913 palignr m0, m3, 12
7914 palignr m3, m5, 12
7915
7916 mova m7, m3
7917 pmaddwd m7, [r6 - 1 * 16] ; [12]
7918 paddd m7, [pd_16]
7919 psrld m7, 5
7920 mova m1, m0
7921 pmaddwd m1, [r6 - 1 * 16]
7922 paddd m1, [pd_16]
7923 psrld m1, 5
7924 packusdw m7, m1
7925
7926 mov r5, r0
7927
; vectors 0-3
7928 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7929
; advance m5 by one word, then slide the window by one pair
7930 pslldq m5, 2
7931 palignr m0, m3, 12
7932 palignr m3, m5, 12
7933
7934 mova m4, m3
7935 pmaddwd m4, [r6 + 10 * 16] ; [23]
7936 paddd m4, [pd_16]
7937 psrld m4, 5
7938 mova m1, m0
7939 pmaddwd m1, [r6 + 10 * 16]
7940 paddd m1, [pd_16]
7941 psrld m1, 5
7942 packusdw m4, m1
7943
7944 mova m2, m3
7945 pmaddwd m2, [r6 - 11 * 16] ; [2]
7946 paddd m2, [pd_16]
7947 psrld m2, 5
7948 mova m1, m0
7949 pmaddwd m1, [r6 - 11 * 16]
7950 paddd m1, [pd_16]
7951 psrld m1, 5
7952 packusdw m2, m1
7953
; advance m5 by one word, then slide the window by one pair
7954 pslldq m5, 2
7955 palignr m0, m3, 12
7956 palignr m3, m5, 12
7957
7958 mova m6, m3
7959 pmaddwd m6, [r6] ; [13]
7960 paddd m6, [pd_16]
7961 psrld m6, 5
7962 mova m7, m0
7963 pmaddwd m7, [r6]
7964 paddd m7, [pd_16]
7965 psrld m7, 5
7966 packusdw m6, m7
7967
; advance m5 by one word, then slide the window by one pair
7968 pslldq m5, 2
7969 palignr m0, m3, 12
7970 palignr m3, m5, 12
7971
7972 mova m7, m3
7973 pmaddwd m7, [r6 + 11 * 16] ; [24]
7974 paddd m7, [pd_16]
7975 psrld m7, 5
7976 mova m1, m0
7977 pmaddwd m1, [r6 + 11 * 16]
7978 paddd m1, [pd_16]
7979 psrld m1, 5
7980 packusdw m7, m1
7981
7982 lea r5, [r0 + r1 * 4]
7983
; vectors 4-7
7984 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7985
7986 mova m4, m3
7987 pmaddwd m4, [r6 - 10 * 16] ; [3]
7988 paddd m4, [pd_16]
7989 psrld m4, 5
7990 mova m6, m0
7991 pmaddwd m6, [r6 - 10 * 16]
7992 paddd m6, [pd_16]
7993 psrld m6, 5
7994 packusdw m4, m6
7995
; advance m5 by one word, then slide the window by one pair
7996 pslldq m5, 2
7997 palignr m0, m3, 12
7998 palignr m3, m5, 12
7999
8000 mova m2, m3
8001 pmaddwd m2, [r6 + 1 * 16] ; [14]
8002 paddd m2, [pd_16]
8003 psrld m2, 5
8004 mova m6, m0
8005 pmaddwd m6, [r6 + 1 * 16]
8006 paddd m6, [pd_16]
8007 psrld m6, 5
8008 packusdw m2, m6
8009
; advance m5 by one word, then slide the window by one pair
8010 pslldq m5, 2
8011 palignr m0, m3, 12
8012 palignr m3, m5, 12
8013
8014 mova m6, m3
8015 pmaddwd m6, [r6 + 12 * 16] ; [25]
8016 paddd m6, [pd_16]
8017 psrld m6, 5
8018 mova m1, m0
8019 pmaddwd m1, [r6 + 12 * 16]
8020 paddd m1, [pd_16]
8021 psrld m1, 5
8022 packusdw m6, m1
8023
8024 mova m7, m3
8025 pmaddwd m7, [r6 - 9 * 16] ; [4]
8026 paddd m7, [pd_16]
8027 psrld m7, 5
8028 mova m1, m0
8029 pmaddwd m1, [r6 - 9 * 16]
8030 paddd m1, [pd_16]
8031 psrld m1, 5
8032 packusdw m7, m1
8033
8034 lea r5, [r5 + r1 * 4]
8035
; vectors 8-11
8036 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
8037
; advance m5 by one word, then slide the window by one pair
8038 pslldq m5, 2
8039 palignr m0, m3, 12
8040 palignr m3, m5, 12
8041
8042 mova m4, m3
8043 pmaddwd m4, [r6 + 2 * 16] ; [15]
8044 paddd m4, [pd_16]
8045 psrld m4, 5
8046 mova m1, m0
8047 pmaddwd m1, [r6 + 2 * 16]
8048 paddd m1, [pd_16]
8049 psrld m1, 5
8050 packusdw m4, m1
8051
; refill m5 from [r3], rearranged by pw_ang8_16 (table defined elsewhere)
8052 movu m5, [r3]
8053 pshufb m5, [pw_ang8_16]
8054
8055 palignr m0, m3, 12
8056 palignr m3, m5, 12
8057
8058 mova m2, m3
8059 pmaddwd m2, [r6 + 13 * 16] ; [26]
8060 paddd m2, [pd_16]
8061 psrld m2, 5
8062 mova m1, m0
8063 pmaddwd m1, [r6 + 13 * 16]
8064 paddd m1, [pd_16]
8065 psrld m1, 5
8066 packusdw m2, m1
8067
8068 mova m7, m3
8069 pmaddwd m7, [r6 - 8 * 16] ; [5]
8070 paddd m7, [pd_16]
8071 psrld m7, 5
8072 mova m1, m0
8073 pmaddwd m1, [r6 - 8 * 16]
8074 paddd m1, [pd_16]
8075 psrld m1, 5
8076 packusdw m7, m1
8077
; advance m5 by one word, then slide the window by one pair
8078 pslldq m5, 2
8079 palignr m0, m3, 12
8080 palignr m3, m5, 12
8081
; last vector is computed in place: m3 / m0 are not needed afterwards
8082 pmaddwd m3, [r6 + 3 * 16] ; [16]
8083 paddd m3, [pd_16]
8084 psrld m3, 5
8085 pmaddwd m0, [r6 + 3 * 16]
8086 paddd m0, [pd_16]
8087 psrld m0, 5
8088 packusdw m3, m0
8089
8090 lea r5, [r5 + r1 * 4]
8091
; vectors 12-15
8092 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
8093
8094 ret
8095
; intra_pred_ang16_16: 16x16 angular intra prediction, mode 16 (16-bit samples).
; Calls ang16_mode_16_20 twice with selector r4 = 0.
;   r0 - output pointer (presumably dst; advanced 8 * r1 bytes between the calls)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - main reference read by the kernel
;   r3 - side reference; its original value is kept in the stack slot at [rsp]
;   r6 - ang_table entry 13, prepared for the kernel
; The 5th cglobal operand reserves one mmsize-sized stack slot (per x86inc convention).
8096cglobal intra_pred_ang16_16, 4,7,8,0-(1*mmsize)
8097 add r1, r1
8098 lea r6, [ang_table + 13 * 16]
; m5 = high qword of shuffled [r3 + 16] (low half) : high qword of shuffled
; [r3 + 4] (high half); pw_ang16_16 is a shuffle table defined elsewhere
8099 movu m6, [r3 + 4]
8100 pshufb m6, [pw_ang16_16]
8101 movu m5, [r3 + 16]
8102 pshufb m5, [pw_ang16_16]
8103 punpckhqdq m5, m6
; save the side-reference pointer; the first call refills m5 from r3 + 24
8104 mov [rsp], r3
8105 lea r3, [r3 + 24]
; selector = 0 (the kernel tests r4d, then overwrites r4)
8106 xor r4, r4
8107
8108 call ang16_mode_16_20
8109
; second half: output down by 8 * r1 bytes, r3 restored to the saved pointer,
; m5 taken from the main reference ([r2], loaded before r2 is advanced)
8110 lea r0, [r0 + r1 * 8]
8111 mov r3, [rsp]
8112 movu m5, [r2]
8113 lea r2, [r2 + 16]
; selector must be set again because the kernel clobbered r4
8114 xor r4, r4
8115
8116 call ang16_mode_16_20
8117
8118 RET
8119
; intra_pred_ang16_20: 16x16 angular intra prediction, mode 20 (16-bit samples).
; Same as mode 16 but with the two reference pointers exchanged, selector
; r4 = 1, and the output advanced by 16 bytes (not 8 * r1) between the calls.
;   r0 - output pointer (presumably dst)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 / r3 - reference pointers, swapped on entry so the kernel reads the other one
;   r6 - ang_table entry 13, prepared for the kernel
; The 5th cglobal operand reserves one mmsize-sized stack slot (per x86inc convention).
8120cglobal intra_pred_ang16_20, 4,7,8,0-(1*mmsize)
8121 xchg r2, r3
8122 add r1, r1
8123 lea r6, [ang_table + 13 * 16]
; m5 = high qword of shuffled [r3 + 16] (low half) : high qword of shuffled
; [r3 + 4] (high half); pw_ang16_16 is a shuffle table defined elsewhere
8124 movu m6, [r3 + 4]
8125 pshufb m6, [pw_ang16_16]
8126 movu m5, [r3 + 16]
8127 pshufb m5, [pw_ang16_16]
8128 punpckhqdq m5, m6
; save the side-reference pointer; the first call refills m5 from r3 + 24
8129 mov [rsp], r3
8130 lea r3, [r3 + 24]
; selector = 1 (the kernel tests r4d, then overwrites r4)
8131 xor r4, r4
8132 inc r4
8133
8134 call ang16_mode_16_20
8135
; second half: output right by 16 bytes, r3 restored to the saved pointer,
; m5 taken from the main reference ([r2], loaded before r2 is advanced)
8136 lea r0, [r0 + 16]
8137 mov r3, [rsp]
8138 movu m5, [r2]
8139 lea r2, [r2 + 16]
; selector must be set again because the kernel clobbered r4
8140 xor r4, r4
8141 inc r4
8142
8143 call ang16_mode_16_20
8144
8145 RET
8146
; ang16_mode_17_19: shared kernel for angular modes 17 and 19; produces sixteen
; 8-word interpolated vectors and hands them to TRANSPOSE_STORE four at a time.
; Inputs (set up by the callers):
;   r0  - output base (copied to r5 before each store group)
;   r1  - row step in bytes (r5 is advanced by r1 * 4 between store groups)
;   r2  - main reference: 9 consecutive words are loaded from [r2] / [r2 + 2]
;   r3  - pointer from which m5 is refilled part-way (after pw_ang8_17 shuffle)
;   r4d - selector; tested once at entry, then r4 is overwritten with 3 * r1.
;         No instruction in this block reads the flags afterwards, so presumably
;         TRANSPOSE_STORE (defined elsewhere) branches on them - TODO confirm
;   r6  - points into ang_table; the bracketed comments give the weight index
;         assuming r6 = ang_table + 16 * 16
;   m5  - extra reference words; its top pair (words 6,7) is shifted into the
;         pair window by each "palignr m3, m5, 12", and "pslldq m5, 2" moves
;         the next word into place
; Every vector is pmaddwd(pairs, weights), + pd_16, >> 5, then packed to words;
; with ang_table entry f = (32 - f, f) this is ((32-f)*a + f*b + 16) >> 5
; (assuming pd_16 holds dwords of 16).
; Writes m0-m7, r4 and r5.
8147cglobal ang16_mode_17_19
8148 test r4d, r4d
; lea leaves the flags from the test above untouched
8149 lea r4, [r1 * 3]
8150 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
8151 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8152
; m6 = m5 shifted down one word with m0's first word on top, so its top pair
; is (m5 word 7, m0 word 0): the bridge between m5 and the main reference
8153 palignr m6, m0, m5, 2
8154
; m3 / m0 = low / high halves of the interleaved neighbouring-sample pairs
8155 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
8156 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
8157
8158 mova m4, m3
8159 pmaddwd m4, [r6 - 10 * 16] ; [6]
8160 paddd m4, [pd_16]
8161 psrld m4, 5
8162 mova m2, m0
8163 pmaddwd m2, [r6 - 10 * 16]
8164 paddd m2, [pd_16]
8165 psrld m2, 5
8166 packusdw m4, m2
8167
; first slide takes the bridging pair from m6
8168 palignr m0, m3, 12
8169 palignr m3, m6, 12
8170
8171 mova m2, m3
8172 pmaddwd m2, [r6 - 4 * 16] ; [12]
8173 paddd m2, [pd_16]
8174 psrld m2, 5
8175 mova m1, m0
8176 pmaddwd m1, [r6 - 4 * 16]
8177 paddd m1, [pd_16]
8178 psrld m1, 5
8179 packusdw m2, m1
8180
; later slides take m5's top pair directly
8181 palignr m0, m3, 12
8182 palignr m3, m5, 12
8183
8184 mova m6, m3
8185 pmaddwd m6, [r6 + 2 * 16] ; [18]
8186 paddd m6, [pd_16]
8187 psrld m6, 5
8188 mova m1, m0
8189 pmaddwd m1, [r6 + 2 * 16]
8190 paddd m1, [pd_16]
8191 psrld m1, 5
8192 packusdw m6, m1
8193
; advance m5 by one word, then slide the window by one pair
8194 pslldq m5, 2
8195 palignr m0, m3, 12
8196 palignr m3, m5, 12
8197
8198 mova m7, m3
8199 pmaddwd m7, [r6 + 8 * 16] ; [24]
8200 paddd m7, [pd_16]
8201 psrld m7, 5
8202 mova m1, m0
8203 pmaddwd m1, [r6 + 8 * 16]
8204 paddd m1, [pd_16]
8205 psrld m1, 5
8206 packusdw m7, m1
8207
8208 mov r5, r0
8209
; vectors 0-3
8210 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
8211
; advance m5 by one word, then slide the window by one pair
8212 pslldq m5, 2
8213 palignr m0, m3, 12
8214 palignr m3, m5, 12
8215
8216 mova m4, m3
8217 pmaddwd m4, [r6 + 14 * 16] ; [30]
8218 paddd m4, [pd_16]
8219 psrld m4, 5
8220 mova m1, m0
8221 pmaddwd m1, [r6 + 14 * 16]
8222 paddd m1, [pd_16]
8223 psrld m1, 5
8224 packusdw m4, m1
8225
8226 mova m2, m3
8227 pmaddwd m2, [r6 - 12 * 16] ; [4]
8228 paddd m2, [pd_16]
8229 psrld m2, 5
8230 mova m1, m0
8231 pmaddwd m1, [r6 - 12 * 16]
8232 paddd m1, [pd_16]
8233 psrld m1, 5
8234 packusdw m2, m1
8235
; advance m5 by one word, then slide the window by one pair
8236 pslldq m5, 2
8237 palignr m0, m3, 12
8238 palignr m3, m5, 12
8239
8240 mova m6, m3
8241 pmaddwd m6, [r6 - 6 * 16] ; [10]
8242 paddd m6, [pd_16]
8243 psrld m6, 5
8244 mova m7, m0
8245 pmaddwd m7, [r6 - 6 * 16]
8246 paddd m7, [pd_16]
8247 psrld m7, 5
8248 packusdw m6, m7
8249
; advance m5 by one word, then slide the window by one pair
8250 pslldq m5, 2
8251 palignr m0, m3, 12
8252 palignr m3, m5, 12
8253
8254 mova m7, m3
8255 pmaddwd m7, [r6] ; [16]
8256 paddd m7, [pd_16]
8257 psrld m7, 5
8258 mova m1, m0
8259 pmaddwd m1, [r6]
8260 paddd m1, [pd_16]
8261 psrld m1, 5
8262 packusdw m7, m1
8263
8264 lea r5, [r0 + r1 * 4]
8265
; vectors 4-7
8266 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
8267
; advance m5 by one word, then slide the window by one pair
8268 pslldq m5, 2
8269 palignr m0, m3, 12
8270 palignr m3, m5, 12
8271
8272 mova m4, m3
8273 pmaddwd m4, [r6 + 6 * 16] ; [22]
8274 paddd m4, [pd_16]
8275 psrld m4, 5
8276 mova m6, m0
8277 pmaddwd m6, [r6 + 6 * 16]
8278 paddd m6, [pd_16]
8279 psrld m6, 5
8280 packusdw m4, m6
8281
; advance m5 by one word, then slide the window by one pair
8282 pslldq m5, 2
8283 palignr m0, m3, 12
8284 palignr m3, m5, 12
8285
8286 mova m2, m3
8287 pmaddwd m2, [r6 + 12 * 16] ; [28]
8288 paddd m2, [pd_16]
8289 psrld m2, 5
8290 mova m6, m0
8291 pmaddwd m6, [r6 + 12 * 16]
8292 paddd m6, [pd_16]
8293 psrld m6, 5
8294 packusdw m2, m6
8295
8296 mova m6, m3
8297 pmaddwd m6, [r6 - 14 * 16] ; [2]
8298 paddd m6, [pd_16]
8299 psrld m6, 5
8300 mova m1, m0
8301 pmaddwd m1, [r6 - 14 * 16]
8302 paddd m1, [pd_16]
8303 psrld m1, 5
8304 packusdw m6, m1
8305
; refill m5 from [r3], rearranged by pw_ang8_17 (table defined elsewhere)
8306 movu m5, [r3]
8307 pshufb m5, [pw_ang8_17]
8308
8309 palignr m0, m3, 12
8310 palignr m3, m5, 12
8311
8312 mova m7, m3
8313 pmaddwd m7, [r6 - 8 * 16] ; [8]
8314 paddd m7, [pd_16]
8315 psrld m7, 5
8316 mova m1, m0
8317 pmaddwd m1, [r6 - 8 * 16]
8318 paddd m1, [pd_16]
8319 psrld m1, 5
8320 packusdw m7, m1
8321
8322 lea r5, [r5 + r1 * 4]
8323
; vectors 8-11
8324 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
8325
; advance m5 by one word, then slide the window by one pair
8326 pslldq m5, 2
8327 palignr m0, m3, 12
8328 palignr m3, m5, 12
8329
8330 mova m4, m3
8331 pmaddwd m4, [r6 - 2 * 16] ; [14]
8332 paddd m4, [pd_16]
8333 psrld m4, 5
8334 mova m1, m0
8335 pmaddwd m1, [r6 - 2 * 16]
8336 paddd m1, [pd_16]
8337 psrld m1, 5
8338 packusdw m4, m1
8339
; advance m5 by one word, then slide the window by one pair
8340 pslldq m5, 2
8341 palignr m0, m3, 12
8342 palignr m3, m5, 12
8343
8344 mova m2, m3
8345 pmaddwd m2, [r6 + 4 * 16] ; [20]
8346 paddd m2, [pd_16]
8347 psrld m2, 5
8348 mova m1, m0
8349 pmaddwd m1, [r6 + 4 * 16]
8350 paddd m1, [pd_16]
8351 psrld m1, 5
8352 packusdw m2, m1
8353
; advance m5 by one word, then slide the window by one pair
8354 pslldq m5, 2
8355 palignr m0, m3, 12
8356 palignr m3, m5, 12
8357
8358 mova m7, m3
8359 pmaddwd m7, [r6 + 10 * 16] ; [26]
8360 paddd m7, [pd_16]
8361 psrld m7, 5
8362 mova m1, m0
8363 pmaddwd m1, [r6 + 10 * 16]
8364 paddd m1, [pd_16]
8365 psrld m1, 5
8366 packusdw m7, m1
8367
; last vector reuses the current window (no slide) with table entry 0, i.e.
; weights (32, 0); computed in place since m3 / m0 are not needed afterwards
8368 pmaddwd m3, [r6 - 16 * 16] ; [0]
8369 paddd m3, [pd_16]
8370 psrld m3, 5
8371 pmaddwd m0, [r6 - 16 * 16]
8372 paddd m0, [pd_16]
8373 psrld m0, 5
8374 packusdw m3, m0
8375
8376 lea r5, [r5 + r1 * 4]
8377
; vectors 12-15
8378 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
8379
8380 ret
8381
; intra_pred_ang16_17: 16x16 angular intra prediction, mode 17 (16-bit samples).
; Calls ang16_mode_17_19 twice with selector r4 = 0.
;   r0 - output pointer (presumably dst; advanced 8 * r1 bytes between the calls)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - main reference read by the kernel
;   r3 - side reference; its original value is kept in the stack slot at [rsp]
;   r6 - ang_table entry 16, prepared for the kernel
; The 5th cglobal operand reserves one mmsize-sized stack slot (per x86inc convention).
8382cglobal intra_pred_ang16_17, 4,7,8,0-(1*mmsize)
8383 add r1, r1
8384 lea r6, [ang_table + 16 * 16]
; m5 = high qword of shuffled [r3 + 12] (low half) : high qword of shuffled
; [r3 + 2] (high half); pw_ang16_16 is a shuffle table defined elsewhere
8385 movu m6, [r3 + 2]
8386 pshufb m6, [pw_ang16_16]
8387 movu m5, [r3 + 12]
8388 pshufb m5, [pw_ang16_16]
8389 punpckhqdq m5, m6
; save the side-reference pointer; the first call refills m5 from r3 + 20
8390 mov [rsp], r3
8391 lea r3, [r3 + 20]
; selector = 0 (the kernel tests r4d, then overwrites r4)
8392 xor r4, r4
8393
8394 call ang16_mode_17_19
8395
; second half: output down by 8 * r1 bytes, r3 restored to the saved pointer,
; m5 taken from the main reference ([r2], loaded before r2 is advanced)
8396 lea r0, [r0 + r1 * 8]
8397 mov r3, [rsp]
8398 movu m5, [r2]
8399 lea r2, [r2 + 16]
; selector must be set again because the kernel clobbered r4
8400 xor r4, r4
8401
8402 call ang16_mode_17_19
8403
8404 RET
8405
; intra_pred_ang16_19: 16x16 angular intra prediction, mode 19 (16-bit samples).
; Same as mode 17 but with the two reference pointers exchanged, selector
; r4 = 1, and the output advanced by 16 bytes (not 8 * r1) between the calls.
;   r0 - output pointer (presumably dst)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 / r3 - reference pointers, swapped on entry so the kernel reads the other one
;   r6 - ang_table entry 16, prepared for the kernel
; The 5th cglobal operand reserves one mmsize-sized stack slot (per x86inc convention).
8406cglobal intra_pred_ang16_19, 4,7,8,0-(1*mmsize)
8407 xchg r2, r3
8408 add r1, r1
8409 lea r6, [ang_table + 16 * 16]
; m5 = high qword of shuffled [r3 + 12] (low half) : high qword of shuffled
; [r3 + 2] (high half); pw_ang16_16 is a shuffle table defined elsewhere
8410 movu m6, [r3 + 2]
8411 pshufb m6, [pw_ang16_16]
8412 movu m5, [r3 + 12]
8413 pshufb m5, [pw_ang16_16]
8414 punpckhqdq m5, m6
; save the side-reference pointer; the first call refills m5 from r3 + 20
8415 mov [rsp], r3
8416 lea r3, [r3 + 20]
; selector = 1 (the kernel tests r4d, then overwrites r4)
8417 xor r4, r4
8418 inc r4
8419
8420 call ang16_mode_17_19
8421
; second half: output right by 16 bytes, r3 restored to the saved pointer,
; m5 taken from the main reference ([r2], loaded before r2 is advanced)
8422 lea r0, [r0 + 16]
8423 mov r3, [rsp]
8424 movu m5, [r2]
8425 lea r2, [r2 + 16]
; selector must be set again because the kernel clobbered r4
8426 xor r4, r4
8427 inc r4
8428
8429 call ang16_mode_17_19
8430
8431 RET
8432
; intra_pred_ang16_18: 16x16 angular intra prediction, mode 18 (16-bit samples).
; No interpolation: each 32-byte output row is a window over the word sequence
; {shuffled [r2 + 18..], shuffled [r2 + 2..], [r3], [r3 + 16]} that moves one
; word towards the r2-derived words per row (palignr shifts 14, 12, ... 2).
;   r0 - output pointer (stores go to [r0], advanced by 4 * r1 after every 4 rows)
;   r1 - stride; doubled on entry (presumably elements -> bytes for 16-bit samples)
;   r2 - reference whose words are reordered by pw_swap16 (table defined
;        elsewhere; presumably reverses the eight words - TODO confirm)
;   r3 - reference copied verbatim into row 0
;   r4 - 3 * r1
8433cglobal intra_pred_ang16_18, 4,5,4
8434 add r1, r1
8435 lea r4, [r1 * 3]
; m1:m3 = 16 words from r3, m0 = shuffled 8 words from r2 + 2
8436 movu m1, [r3]
8437 movu m3, [r3 + 16]
8438 movu m0, [r2 + 2]
8439 pshufb m0, [pw_swap16]
; rows 0-3: row 0 is the r3 data unchanged; each later row shifts in one more
; word from the top of m0
8440 movu [r0], m1
8441 movu [r0 + 16], m3
8442 palignr m2, m1, m0, 14
8443 movu [r0 + r1], m2
8444 palignr m2, m3, m1, 14
8445 movu [r0 + r1 + 16], m2
8446 palignr m2, m1, m0, 12
8447 movu [r0 + r1 * 2], m2
8448 palignr m2, m3, m1, 12
8449 movu [r0 + r1 * 2 + 16], m2
8450 palignr m2, m1, m0, 10
8451 movu [r0 + r4], m2
8452 palignr m2, m3, m1, 10
8453 movu [r0 + r4 + 16], m2
8454
; rows 4-7
8455 lea r0, [r0 + r1 * 4]
8456 palignr m2, m1, m0, 8
8457 movu [r0], m2
8458 palignr m2, m3, m1, 8
8459 movu [r0 + 16], m2
8460 palignr m2, m1, m0, 6
8461 movu [r0 + r1], m2
8462 palignr m2, m3, m1, 6
8463 movu [r0 + r1 + 16], m2
8464 palignr m2, m1, m0, 4
8465 movu [r0 + r1 * 2], m2
8466 palignr m2, m3, m1, 4
8467 movu [r0 + r1 * 2 + 16], m2
8468 palignr m2, m1, m0, 2
8469 movu [r0 + r4], m2
; m3 is no longer needed after this row, so it is shifted in place
8470 palignr m3, m1, 2
8471 movu [r0 + r4 + 16], m3
8472
; rows 8-11: the window is now m0:m1; m3 is reloaded with the next 8 shuffled
; words from r2 + 18 to feed the low side
8473 lea r0, [r0 + r1 * 4]
8474 movu [r0], m0
8475 movu [r0 + 16], m1
8476 movu m3, [r2 + 18]
8477 pshufb m3, [pw_swap16]
8478 palignr m2, m0, m3, 14
8479 movu [r0 + r1], m2
8480 palignr m2, m1, m0, 14
8481 movu [r0 + r1 + 16], m2
8482 palignr m2, m0, m3, 12
8483 movu [r0 + r1 * 2], m2
8484 palignr m2, m1, m0, 12
8485 movu [r0 + r1 * 2 + 16], m2
8486 palignr m2, m0, m3, 10
8487 movu [r0 + r4], m2
8488 palignr m2, m1, m0, 10
8489 movu [r0 + r4 + 16], m2
8490
; rows 12-15
8491 lea r0, [r0 + r1 * 4]
8492 palignr m2, m0, m3, 8
8493 movu [r0], m2
8494 palignr m2, m1, m0, 8
8495 movu [r0 + 16], m2
8496 palignr m2, m0, m3, 6
8497 movu [r0 + r1], m2
8498 palignr m2, m1, m0, 6
8499 movu [r0 + r1 + 16], m2
8500 palignr m2, m0, m3, 4
8501 movu [r0 + r1 * 2], m2
8502 palignr m2, m1, m0, 4
8503 movu [r0 + r1 * 2 + 16], m2
8504 palignr m2, m0, m3, 2
8505 movu [r0 + r4], m2
; m1 is no longer needed after this row, so it is shifted in place
8506 palignr m1, m0, 2
8507 movu [r0 + r4 + 16], m1
8508
8509 RET
8510
;-----------------------------------------------------------------------------
; intra_pred_ang16_10: 16x16 intra prediction, mode 10.
; r0 = dst, r1 = dst stride (doubled below; a 16-pixel row is 32 bytes),
; r2 = reference whose words 1..16 are replicated across rows 0..15,
; r3 = the other reference, read only when the filter flag (r5m) is non-zero.
; Rows 1-15 are written first.  Row 0 is written last at .quit, either as a
; plain replication of word 1 or, when filtering, as
;   clip(word1 + ((r3[1 + x] - r3[0]) >> 1)) for x = 0..15,
; clamped to [0, pw_1023].
; pw_unpackwdq / pw_1023 are defined elsewhere; per the lane comments below,
; pw_unpackwdq replicates word 0 into all eight words.
; Scratch: r4 = 3 * stride, r2 (reused as a row pointer), m0-m3.
;-----------------------------------------------------------------------------
8511cglobal intra_pred_ang16_10, 4,5,4
8512 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8513 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
8514 pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
8515 add r1, r1
8516 lea r4, [r1 * 3]
8517
; rows 1-3 (row 0 is deferred until the filter decision)
8518 psrldq m1, 2
8519 pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
8520 movu [r0 + r1], m2
8521 movu [r0 + r1 + 16], m2
8522 psrldq m1, 2
8523 pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
8524 movu [r0 + r1 * 2], m2
8525 movu [r0 + r1 * 2 + 16], m2
8526 psrldq m1, 2
8527 pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
8528 movu [r0 + r4], m2
8529 movu [r0 + r4 + 16], m2
8530
; rows 4-7; the reference words are already in m1/m3, so r2 becomes the row pointer
8531 lea r2, [r0 + r1 *4]
8532 psrldq m1, 2
8533 pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
8534 movu [r2], m2
8535 movu [r2 + 16], m2
8536 psrldq m1, 2
8537 pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
8538 movu [r2 + r1], m2
8539 movu [r2 + r1 + 16], m2
8540 psrldq m1, 2
8541 pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
8542 movu [r2 + r1 * 2], m2
8543 movu [r2 + r1 * 2 + 16], m2
8544 psrldq m1, 2
8545 pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
8546 movu [r2 + r4], m2
8547 movu [r2 + r4 + 16], m2
8548
; rows 8-11
8549 lea r2, [r2 + r1 *4]
8550 pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9]
8551 movu [r2], m2
8552 movu [r2 + 16], m2
8553 psrldq m3, 2
8554 pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10]
8555 movu [r2 + r1], m2
8556 movu [r2 + r1 + 16], m2
8557 psrldq m3, 2
8558 pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11]
8559 movu [r2 + r1 * 2], m2
8560 movu [r2 + r1 * 2 + 16], m2
8561 psrldq m3, 2
8562 pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12]
8563 movu [r2 + r4], m2
8564 movu [r2 + r4 + 16], m2
8565
; rows 12-15
8566 lea r2, [r2 + r1 *4]
8567 psrldq m3, 2
8568 pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13]
8569 movu [r2], m2
8570 movu [r2 + 16], m2
8571 psrldq m3, 2
8572 pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14]
8573 movu [r2 + r1], m2
8574 movu [r2 + r1 + 16], m2
8575 psrldq m3, 2
8576 pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15]
8577 movu [r2 + r1 * 2], m2
8578 movu [r2 + r1 * 2 + 16], m2
8579 psrldq m3, 2
8580 pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16]
8581 movu [r2 + r4], m2
8582 movu [r2 + r4 + 16], m2
8583 mova m3, m0 ; unfiltered row 0: both halves are the replicated word 1
8584
8585 cmp r5m, byte 0
8586 jz .quit
8587
8588 ; filter
8589
8590 movh m1, [r3] ; [3 2 1 0]
8591 pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
8592 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
8593 movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
8594 psubw m1, m2
8595 psubw m3, m2
8596 psraw m1, 1
8597 psraw m3, 1
8598 paddw m3, m0 ; m3 must be summed first: the next line overwrites m0
8599 paddw m0, m1
8600 pxor m1, m1
8601 pmaxsw m0, m1 ; clamp to >= 0
8602 pminsw m0, [pw_1023] ; clamp to <= pw_1023
8603 pmaxsw m3, m1
8604 pminsw m3, [pw_1023]
8605.quit:
8606 movu [r0], m0
8607 movu [r0 + 16], m3
8608
8609 RET
8610
;-----------------------------------------------------------------------------
; intra_pred_ang16_26: 16x16 intra prediction, mode 26.
; r0 = dst, r1 = dst stride (doubled below; a 16-pixel row is 32 bytes),
; r3 = reference whose words 1..16 are copied unchanged into every row,
; r2 = the other reference, read only when the filter flag (r5m) is non-zero.
; When filtering, column 0 of row y is overwritten with
;   clip(r3_word1 + ((r2[1 + y] - r2[0]) >> 1)), clamped to [0, pw_1023].
; After the copy phase r3 points at row 12 of dst; the filter stores rely on that.
; Scratch: r4 = 3 * stride, r3 (reused as a row pointer), m0-m3.
;-----------------------------------------------------------------------------
8611cglobal intra_pred_ang16_26, 4,5,4
8612 movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
8613 movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
8614 add r1, r1
8615 lea r4, [r1 * 3]
8616
; rows 0-3
8617 movu [r0], m0
8618 movu [r0 + 16], m3
8619 movu [r0 + r1], m0
8620 movu [r0 + r1 + 16], m3
8621 movu [r0 + r1 * 2], m0
8622 movu [r0 + r1 * 2 + 16], m3
8623 movu [r0 + r4], m0
8624 movu [r0 + r4 + 16], m3
8625
; rows 4-7; the reference words are already in m0/m3, so r3 becomes the row pointer
8626 lea r3, [r0 + r1 *4]
8627 movu [r3], m0
8628 movu [r3 + 16], m3
8629 movu [r3 + r1], m0
8630 movu [r3 + r1 + 16], m3
8631 movu [r3 + r1 * 2], m0
8632 movu [r3 + r1 * 2 + 16], m3
8633 movu [r3 + r4], m0
8634 movu [r3 + r4 + 16], m3
8635
; rows 8-11
8636 lea r3, [r3 + r1 *4]
8637 movu [r3], m0
8638 movu [r3 + 16], m3
8639 movu [r3 + r1], m0
8640 movu [r3 + r1 + 16], m3
8641 movu [r3 + r1 * 2], m0
8642 movu [r3 + r1 * 2 + 16], m3
8643 movu [r3 + r4], m0
8644 movu [r3 + r4 + 16], m3
8645
; rows 12-15 (r3 stays at row 12 from here on)
8646 lea r3, [r3 + r1 *4]
8647 movu [r3], m0
8648 movu [r3 + 16], m3
8649 movu [r3 + r1], m0
8650 movu [r3 + r1 + 16], m3
8651 movu [r3 + r1 * 2], m0
8652 movu [r3 + r1 * 2 + 16], m3
8653 movu [r3 + r4], m0
8654 movu [r3 + r4 + 16], m3
8655
8656 cmp r5m, byte 0
8657 jz .quit
8658
8659 ; filter
8660
8661 pshufb m0, [pw_unpackwdq] ; replicate word 0 of m0 (per the lane comments used with pw_unpackwdq in this file)
8662 movh m1, [r2] ; [3 2 1 0]
8663 pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
8664 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8665 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
8666 psubw m1, m2
8667 psubw m3, m2
8668 psraw m1, 1
8669 psraw m3, 1
8670 paddw m3, m0 ; m3 must be summed first: the next line overwrites m0
8671 paddw m0, m1
8672 pxor m1, m1
8673 pmaxsw m0, m1 ; clamp to >= 0
8674 pminsw m0, [pw_1023] ; clamp to <= pw_1023
8675 pmaxsw m3, m1
8676 pminsw m3, [pw_1023]
; scatter one filtered word into column 0 of each row: m0 -> rows 0-7, m3 -> rows 8-15
8677 pextrw [r0], m0, 0
8678 pextrw [r0 + r1], m0, 1
8679 pextrw [r0 + r1 * 2], m0, 2
8680 pextrw [r0 + r4], m0, 3
8681 lea r0, [r0 + r1 * 4]
8682 pextrw [r0], m0, 4
8683 pextrw [r0 + r1], m0, 5
8684 pextrw [r0 + r1 * 2], m0, 6
8685 pextrw [r0 + r4], m0, 7
8686 lea r0, [r0 + r1 * 4]
8687 pextrw [r0], m3, 0
8688 pextrw [r0 + r1], m3, 1
8689 pextrw [r0 + r1 * 2], m3, 2
8690 pextrw [r0 + r4], m3, 3
8691 pextrw [r3], m3, 4 ; r3 still addresses row 12
8692 pextrw [r3 + r1], m3, 5
8693 pextrw [r3 + r1 * 2], m3, 6
8694 pextrw [r3 + r4], m3, 7
8695
8696.quit:
8697 RET
8698
;-----------------------------------------------------------------------------
; MODE_2_34: emits 16 rows of a 64-byte-wide destination block.
; Row k is a plain copy of the 32 reference words starting at [r2 + 4 + 2 * k],
; i.e. each row is the previous one shifted by one word; no interpolation.
; Expects: r0 = dst row pointer, r1 = stride in bytes, r3 = 2 * stride,
;          r4 = 3 * stride, r2 = reference pointer.
; On exit r0 has advanced by 16 rows; r2 is unchanged.  Clobbers m0-m5.
;-----------------------------------------------------------------------------
8699%macro MODE_2_34 0
8700 movu m0, [r2 + 4]
8701 movu m1, [r2 + 20]
8702 movu m2, [r2 + 36]
8703 movu m3, [r2 + 52]
8704 movu m4, [r2 + 68]
; rows 0-3
8705 movu [r0], m0
8706 movu [r0 + 16], m1
8707 movu [r0 + 32], m2
8708 movu [r0 + 48], m3
8709 palignr m5, m1, m0, 2
8710 movu [r0 + r1], m5
8711 palignr m5, m2, m1, 2
8712 movu [r0 + r1 + 16], m5
8713 palignr m5, m3, m2, 2
8714 movu [r0 + r1 + 32], m5
8715 palignr m5, m4, m3, 2
8716 movu [r0 + r1 + 48], m5
8717 palignr m5, m1, m0, 4
8718 movu [r0 + r3], m5
8719 palignr m5, m2, m1, 4
8720 movu [r0 + r3 + 16], m5
8721 palignr m5, m3, m2, 4
8722 movu [r0 + r3 + 32], m5
8723 palignr m5, m4, m3, 4
8724 movu [r0 + r3 + 48], m5
8725 palignr m5, m1, m0, 6
8726 movu [r0 + r4], m5
8727 palignr m5, m2, m1, 6
8728 movu [r0 + r4 + 16], m5
8729 palignr m5, m3, m2, 6
8730 movu [r0 + r4 + 32], m5
8731 palignr m5, m4, m3, 6
8732 movu [r0 + r4 + 48], m5
; rows 4-7
8733 lea r0, [r0 + r1 * 4]
8734 palignr m5, m1, m0, 8
8735 movu [r0], m5
8736 palignr m5, m2, m1, 8
8737 movu [r0 + 16], m5
8738 palignr m5, m3, m2, 8
8739 movu [r0 + 32], m5
8740 palignr m5, m4, m3, 8
8741 movu [r0 + 48], m5
8742 palignr m5, m1, m0, 10
8743 movu [r0 + r1], m5
8744 palignr m5, m2, m1, 10
8745 movu [r0 + r1 + 16], m5
8746 palignr m5, m3, m2, 10
8747 movu [r0 + r1 + 32], m5
8748 palignr m5, m4, m3, 10
8749 movu [r0 + r1 + 48], m5
8750 palignr m5, m1, m0, 12
8751 movu [r0 + r3], m5
8752 palignr m5, m2, m1, 12
8753 movu [r0 + r3 + 16], m5
8754 palignr m5, m3, m2, 12
8755 movu [r0 + r3 + 32], m5
8756 palignr m5, m4, m3, 12
8757 movu [r0 + r3 + 48], m5
8758 palignr m5, m1, m0, 14
8759 movu [r0 + r4], m5
8760 palignr m5, m2, m1, 14
8761 movu [r0 + r4 + 16], m5
8762 palignr m5, m3, m2, 14
8763 movu [r0 + r4 + 32], m5
8764 palignr m5, m4, m3, 14
8765 movu [r0 + r4 + 48], m5
; rows 8-11: the window has moved a full register, so m0 is recycled for the next 8 words
8766 lea r0, [r0 + r1 * 4]
8767 movu m0, [r2 + 84]
8768 movu [r0], m1
8769 movu [r0 + 16], m2
8770 movu [r0 + 32], m3
8771 movu [r0 + 48], m4
8772 palignr m5, m2, m1, 2
8773 movu [r0 + r1], m5
8774 palignr m5, m3, m2, 2
8775 movu [r0 + r1 + 16], m5
8776 palignr m5, m4, m3, 2
8777 movu [r0 + r1 + 32], m5
8778 palignr m5, m0, m4, 2
8779 movu [r0 + r1 + 48], m5
8780 palignr m5, m2, m1, 4
8781 movu [r0 + r3], m5
8782 palignr m5, m3, m2, 4
8783 movu [r0 + r3 + 16], m5
8784 palignr m5, m4, m3, 4
8785 movu [r0 + r3 + 32], m5
8786 palignr m5, m0, m4, 4
8787 movu [r0 + r3 + 48], m5
8788 palignr m5, m2, m1, 6
8789 movu [r0 + r4], m5
8790 palignr m5, m3, m2, 6
8791 movu [r0 + r4 + 16], m5
8792 palignr m5, m4, m3, 6
8793 movu [r0 + r4 + 32], m5
8794 palignr m5, m0, m4, 6
8795 movu [r0 + r4 + 48], m5
; rows 12-15
8796 lea r0, [r0 + r1 * 4]
8797 palignr m5, m2, m1, 8
8798 movu [r0], m5
8799 palignr m5, m3, m2, 8
8800 movu [r0 + 16], m5
8801 palignr m5, m4, m3, 8
8802 movu [r0 + 32], m5
8803 palignr m5, m0, m4, 8
8804 movu [r0 + 48], m5
8805 palignr m5, m2, m1, 10
8806 movu [r0 + r1], m5
8807 palignr m5, m3, m2, 10
8808 movu [r0 + r1 + 16], m5
8809 palignr m5, m4, m3, 10
8810 movu [r0 + r1 + 32], m5
8811 palignr m5, m0, m4, 10
8812 movu [r0 + r1 + 48], m5
8813 palignr m5, m2, m1, 12
8814 movu [r0 + r3], m5
8815 palignr m5, m3, m2, 12
8816 movu [r0 + r3 + 16], m5
8817 palignr m5, m4, m3, 12
8818 movu [r0 + r3 + 32], m5
8819 palignr m5, m0, m4, 12
8820 movu [r0 + r3 + 48], m5
8821 palignr m5, m2, m1, 14
8822 movu [r0 + r4], m5
8823 palignr m5, m3, m2, 14
8824 movu [r0 + r4 + 16], m5
8825 palignr m5, m4, m3, 14
8826 movu [r0 + r4 + 32], m5
8827 palignr m5, m0, m4, 14
8828 movu [r0 + r4 + 48], m5
8829 lea r0, [r0 + r1 * 4]
8830%endmacro
8831;--------------------------------------------------------------------------------------------------------------------
8832; void intraPredAng32_2_34(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8833;--------------------------------------------------------------------------------------------------------------------
8834INIT_XMM ssse3
; intra_pred_ang32_2: 32x32 prediction shared by modes 2 and 34.
; The reference pointer is argument 2 unless dirMode (argument 4) equals 34,
; in which case argument 3 is used instead.  MODE_2_34 writes 16 rows per pass,
; so two passes fill the block; the reference advances 32 bytes between passes.
cglobal intra_pred_ang32_2, 3,6,6
    ; Pick the reference first: r3 is overwritten below and r3mp may be that
    ; very register, so the select has to happen before any scratch setup.
    cmp         r4m, byte 34
    cmove       r2, r3mp

    add         r1, r1                      ; stride * 2 (bytes per row)
    mov         r5, 2                       ; two passes of 16 rows
    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    lea         r3, [r1 * 2]                ; r3 = 2 * stride

.next_half:
    MODE_2_34
    add         r2, 32
    dec         r5
    jnz         .next_half
    RET
8850
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x8 offset, transpose_flag, reg_a, reg_b, reg_c, reg_d
;   %1      byte offset added to every store address (transpose path only)
;   %2      1 = transpose the four registers and store 4 rows of 8 words;
;           anything else = store each register as two 4-word rows (8 rows)
;   %3-%6   the four source registers; they are overwritten on the transpose path
; Expects r0 = dst row pointer, r1 = stride in bytes, r5 = 3 * stride.
; Transpose path: clobbers m0, leaves r0 untouched.
; Direct path: ignores %1 and advances r0 by 8 rows.
;-----------------------------------------------------------------------------
8851%macro TRANSPOSE_STORE_8x8 6
8852 %if %2 == 1
8853 ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
8854 punpckhwd m0, %3, %4
8855 punpcklwd %3, %4
8856 punpckhwd %4, %3, m0
8857 punpcklwd %3, m0
8858
8859 punpckhwd m0, %5, %6
8860 punpcklwd %5, %6
8861 punpckhwd %6, %5, m0
8862 punpcklwd %5, m0
8863
8864 punpckhqdq m0, %3, %5
8865 punpcklqdq %3, %5
8866 punpcklqdq %5, %4, %6
8867 punpckhqdq %4, %6
8868
; after the shuffles the four output rows live in %3, m0, %5, %4 (in that order)
8869 movu [r0 + %1], %3
8870 movu [r0 + r1 + %1], m0
8871 movu [r0 + r1 * 2 + %1], %5
8872 movu [r0 + r5 + %1], %4
8873 %else
8874 ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32
8875 movh [r0], %3
8876 movhps [r0 + r1], %3
8877 movh [r0 + r1 * 2], %4
8878 movhps [r0 + r5], %4
8879 lea r0, [r0 + r1 * 4]
8880 movh [r0], %5
8881 movhps [r0 + r1], %5
8882 movh [r0 + r1 * 2], %6
8883 movhps [r0 + r5], %6
8884 lea r0, [r0 + r1 * 4]
8885 %endif
8886%endmacro
8887
;-----------------------------------------------------------------------------
; MODE_3_33 transpose_flag
; Computes 32 interpolated lines of 4 pixels each for the angle shared by
; modes 3 and 33 and hands them to TRANSPOSE_STORE_8x8 in four groups of eight
; (byte offsets 0, 16, 32, 48); %1 is forwarded as that macro's second argument.
; Each line is (pmaddwd of interleaved neighbouring reference words with one
; 16-byte ang_table row) + pd_16, shifted right by 5.  r3 must point at
; ang_table + 16 * 16, so [r3 + k * 16] is the weight row for fraction 16 + k;
; the bracketed number after each pmaddwd is that fraction.
; Lines with fraction 0 are loaded straight from the reference (movhps).
; Lane comments list reference word indices from high to low, with [r2 + 2]
; being word 1; 'x' marks a lane holding stale data that is never consumed.
; Expects r2 = reference pointer, plus r0 / r1 / r5 as required by
; TRANSPOSE_STORE_8x8.  Clobbers m0-m7.
;-----------------------------------------------------------------------------
8888%macro MODE_3_33 1
8889 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8890 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
8891 mova m7, m0
8892
8893 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
8894 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2
8895 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0
8896
8897 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1
8898 pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
8899 paddd m4, [pd_16]
8900 psrld m4, 5
8901
8902 pmaddwd m5, m1, [r3 + 4 * 16] ; [20]
8903 paddd m5, [pd_16]
8904 psrld m5, 5
8905 packusdw m4, m5
8906
8907 palignr m5, m2, m0, 8
8908 pmaddwd m5, [r3 - 2 * 16] ; [14]
8909 paddd m5, [pd_16]
8910 psrld m5, 5
8911
8912 palignr m6, m2, m0, 12
8913 pmaddwd m6, [r3 - 8 * 16] ; [ 8]
8914 paddd m6, [pd_16]
8915 psrld m6, 5
8916 packusdw m5, m6
8917
8918 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
8919 paddd m6, [pd_16]
8920 psrld m6, 5
8921
8922 pmaddwd m1, m2, [r3 + 12 * 16] ; [28]
8923 paddd m1, [pd_16]
8924 psrld m1, 5
8925 packusdw m6, m1
8926
8927 palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
8928 pmaddwd m1, m0, [r3 + 6 * 16] ; [22]
8929 paddd m1, [pd_16]
8930 psrld m1, 5
8931
8932 psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10]
8933 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7]
8934
8935 pmaddwd m2, [r3] ; [16]
8936 paddd m2, [pd_16]
8937 psrld m2, 5
8938 packusdw m1, m2
8939
8940 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8941
8942 palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8]
8943 movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16]
8944 palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9]
8945 punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12]
8946 punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8]
8947
8948 palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9]
8949 pmaddwd m4, m0, [r3 - 6 * 16] ; [10]
8950 paddd m4, [pd_16]
8951 psrld m4, 5
8952
8953 pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
8954 paddd m1, [pd_16]
8955 psrld m1, 5
8956 packusdw m4, m1
8957
8958 pmaddwd m5, [r3 + 14 * 16] ; [30]
8959 paddd m5, [pd_16]
8960 psrld m5, 5
8961
8962 palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10]
8963 pmaddwd m6, [r3 + 8 * 16] ; [24]
8964 paddd m6, [pd_16]
8965 psrld m6, 5
8966 packusdw m5, m6
8967
8968 palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11]
8969 pmaddwd m6, m1, [r3 + 2 * 16] ; [18]
8970 paddd m6, [pd_16]
8971 psrld m6, 5
8972
8973 pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
8974 paddd m1, [pd_16]
8975 psrld m1, 5
8976 packusdw m6, m1
8977
8978 palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13]
8979 pmaddwd m1, m2, [r3 - 10 * 16] ; [6]
8980 paddd m1, [pd_16]
8981 psrld m1, 5
8982
8983 packusdw m1, m1
8984 movhps m1, [r2 + 28] ; [00]
8985
8986 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
8987
8988 movu m0, [r2 + 28] ; [21 20 19 18 17 16 15 14]
8989 palignr m1, m0, 2 ; [ x 21 20 19 18 17 16 15]
8990 punpckhwd m2, m0, m1 ; [ x 21 21 20 20 19 19 18]
8991 punpcklwd m0, m1 ; [18 17 17 16 16 15 15 14]
8992
8993 pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
8994 paddd m4, [pd_16]
8995 psrld m4, 5
8996
8997 palignr m1, m2, m0, 4 ; [19 18 18 17 17 16 16 15]
8998 pmaddwd m1, [r3 + 4 * 16] ; [20]
8999 paddd m1, [pd_16]
9000 psrld m1, 5
9001 packusdw m4, m1
9002
9003 palignr m5, m2, m0, 8 ; [20 19 19 18 18 17 17 16]
9004 pmaddwd m5, [r3 - 2 * 16] ; [14]
9005 paddd m5, [pd_16]
9006 psrld m5, 5
9007
9008 palignr m6, m2, m0, 12 ; [21 20 20 19 19 18 18 17]
9009 pmaddwd m6, [r3 - 8 * 16] ; [ 8]
9010 paddd m6, [pd_16]
9011 psrld m6, 5
9012 packusdw m5, m6
9013
9014 pinsrw m2, [r2 + 44], 7 ; fill the stale top lane: [22 21 21 20 20 19 19 18]
9015 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
9016 paddd m6, [pd_16]
9017 psrld m6, 5
9018
9019 pmaddwd m2, [r3 + 12 * 16] ; [28]
9020 paddd m2, [pd_16]
9021 psrld m2, 5
9022 packusdw m6, m2
9023
9024 movu m3, [r2 + 38] ; [26 25 24 23 22 21 20 19]
9025 palignr m1, m3, 2 ; [ x 26 25 24 23 22 21 20]
9026 punpckhwd m2, m3, m1 ; [ x 26 26 25 25 24 24 23]
9027 punpcklwd m3, m1 ; [23 22 22 21 21 20 20 19]
9028
9029 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
9030 paddd m1, [pd_16]
9031 psrld m1, 5
9032
9033 palignr m0, m2, m3, 4
9034 pmaddwd m0, [r3] ; [16]
9035 paddd m0, [pd_16]
9036 psrld m0, 5
9037 packusdw m1, m0
9038
9039 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9040
9041 palignr m5, m2, m3, 8
9042 pmaddwd m4, m5, [r3 - 6 * 16] ; [10]
9043 paddd m4, [pd_16]
9044 psrld m4, 5
9045
9046 palignr m5, m2, m3, 12
9047 pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
9048 paddd m1, [pd_16]
9049 psrld m1, 5
9050 packusdw m4, m1
9051
9052 pmaddwd m5, [r3 + 14 * 16] ; [30]
9053 paddd m5, [pd_16]
9054 psrld m5, 5
9055
9056 movu m3, [r2 + 46]
9057 palignr m1, m3, 2
9058 punpckhwd m2, m3, m1
9059 punpcklwd m3, m1
9060
9061 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
9062 paddd m6, [pd_16]
9063 psrld m6, 5
9064 packusdw m5, m6
9065
9066 palignr m6, m2, m3, 4
9067 pmaddwd m6, [r3 + 2 * 16] ; [18]
9068 paddd m6, [pd_16]
9069 psrld m6, 5
9070
9071 palignr m1, m2, m3, 8
9072 pmaddwd m1, [r3 - 4 * 16] ; [12]
9073 paddd m1, [pd_16]
9074 psrld m1, 5
9075 packusdw m6, m1
9076
9077 palignr m1, m2, m3, 12
9078 pmaddwd m1, [r3 - 10 * 16] ; [06]
9079 paddd m1, [pd_16]
9080 psrld m1, 5
9081
9082 packusdw m1, m1
9083 movhps m1, [r2 + 54] ; [00]
9084
9085 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9086%endmacro
9087;------------------------------------------------------------------------------------------------------------------
9088; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9089;------------------------------------------------------------------------------------------------------------------
9090INIT_XMM sse4
; intra_pred_ang32_3: 32x32 angular prediction, mode 3.
; Runs the transposing variant of MODE_3_33 eight times.  Each pass fills four
; destination rows, then the destination moves down four rows and the
; reference pointer advances by 8 bytes.
cglobal intra_pred_ang32_3, 3,6,8
    add         r1, r1                      ; stride * 2 (bytes per row)
    lea         r5, [r1 * 3]                ; r5 = 3 * stride for the store macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> weight row for fraction 16
    mov         r4d, 8                      ; 8 passes x 4 rows = 32 rows

.next_rows:
    MODE_3_33 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]
    dec         r4
    jnz         .next_rows
    RET
9104
;-----------------------------------------------------------------------------
; MODE_4_32 transpose_flag
; Computes 32 interpolated lines of 4 pixels each for the angle shared by
; modes 4 and 32 and hands them to TRANSPOSE_STORE_8x8 in four groups of eight
; (byte offsets 0, 16, 32, 48); %1 is forwarded as that macro's second argument.
; Each line is (pmaddwd of interleaved neighbouring reference words with one
; 16-byte ang_table row) + pd_16, shifted right by 5.  r3 must point at
; ang_table + 16 * 16, so [r3 + k * 16] is the weight row for fraction 16 + k;
; the bracketed number after each pmaddwd is that fraction.
; The final line has fraction 0 and is loaded straight from the reference.
; Lane comments list reference word indices from high to low, with [r2 + 2]
; being word 1.
; Expects r2 = reference pointer, plus r0 / r1 / r5 as required by
; TRANSPOSE_STORE_8x8.  Clobbers m0-m7.
;-----------------------------------------------------------------------------
9105%macro MODE_4_32 1
9106 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9107 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9108 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9109 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9110 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9111
9112 pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
9113 paddd m4, [pd_16]
9114 psrld m4, 5
9115
9116 palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9117 pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
9118 paddd m1, [pd_16]
9119 psrld m1, 5
9120 packusdw m4, m1
9121
9122 pmaddwd m5, [r3 + 15 * 16] ; [31]
9123 paddd m5, [pd_16]
9124 psrld m5, 5
9125
9126 palignr m6, m2, m0, 8
9127 pmaddwd m6, [r3 + 4 * 16] ; [ 20]
9128 paddd m6, [pd_16]
9129 psrld m6, 5
9130 packusdw m5, m6
9131
9132 palignr m1, m2, m0, 12
9133 pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
9134 paddd m6, [pd_16]
9135 psrld m6, 5
9136
9137 pmaddwd m1, [r3 + 14 * 16] ; [30]
9138 paddd m1, [pd_16]
9139 psrld m1, 5
9140 packusdw m6, m1
9141
9142 pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
9143 paddd m1, [pd_16]
9144 psrld m1, 5
9145
9146 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6] (m3 is still the raw row, so its low words 9,10 complete the pairs)
9147 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9148 paddd m0, [pd_16]
9149 psrld m0, 5
9150 packusdw m1, m0
9151
9152 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9153
9154 pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
9155 paddd m4, [pd_16]
9156 psrld m4, 5
9157
9158 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
9159
9160 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
9161 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
9162 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
9163 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
9164
9165 palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 8 7]
9166 pmaddwd m1, [r3 + 2 * 16] ; [18]
9167 paddd m1, [pd_16]
9168 psrld m1, 5
9169 packusdw m4, m1
9170
9171 palignr m5, m2, m7, 8
9172 mova m6, m5
9173 pmaddwd m5, [r3 - 9 * 16] ; [07]
9174 paddd m5, [pd_16]
9175 psrld m5, 5
9176
9177 pmaddwd m6, [r3 + 12 * 16] ; [28]
9178 paddd m6, [pd_16]
9179 psrld m6, 5
9180 packusdw m5, m6
9181
9182 palignr m6, m2, m7, 12
9183 pmaddwd m6, [r3 + 16] ; [17]
9184 paddd m6, [pd_16]
9185 psrld m6, 5
9186
9187 pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
9188 paddd m1, [pd_16]
9189 psrld m1, 5
9190 packusdw m6, m1
9191
9192 pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
9193 paddd m1, [pd_16]
9194 psrld m1, 5
9195
9196 palignr m7, m3, m2, 4
9197 pmaddwd m7, [r3] ; [16]
9198 paddd m7, [pd_16]
9199 psrld m7, 5
9200 packusdw m1, m7
9201 mova m7, m0 ; keep raw words 17..24 for the next window build
9202
9203 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9204
9205 palignr m0, m3, m2, 8
9206 pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
9207 paddd m4, [pd_16]
9208 psrld m4, 5
9209
9210 pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
9211 paddd m1, [pd_16]
9212 psrld m1, 5
9213 packusdw m4, m1
9214
9215 palignr m5, m3, m2, 12
9216 pmaddwd m5, [r3 - 16] ; [15]
9217 paddd m5, [pd_16]
9218 psrld m5, 5
9219
9220 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
9221 paddd m1, [pd_16]
9222 psrld m1, 5
9223 packusdw m5, m1
9224
9225 pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
9226 paddd m6, [pd_16]
9227 psrld m6, 5
9228
9229 movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
9230 palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
9231 palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
9232 punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
9233 punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
9234
9235 palignr m1, m2, m3, 4
9236 pmaddwd m1, [r3 - 2 * 16] ; [14]
9237 paddd m1, [pd_16]
9238 psrld m1, 5
9239 packusdw m6, m1
9240
9241 palignr m1, m2, m3, 8
9242 mova m0, m1
9243 pmaddwd m1, [r3 - 13 * 16] ; [3]
9244 paddd m1, [pd_16]
9245 psrld m1, 5
9246
9247 pmaddwd m0, [r3 + 8 * 16] ; [24]
9248 paddd m0, [pd_16]
9249 psrld m0, 5
9250 packusdw m1, m0
9251
9252 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9253
9254 palignr m4, m2, m3, 12
9255 pmaddwd m4, [r3 - 3 * 16] ; [13]
9256 paddd m4, [pd_16]
9257 psrld m4, 5
9258
9259 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
9260 paddd m1, [pd_16]
9261 psrld m1, 5
9262 packusdw m4, m1
9263
9264 pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
9265 paddd m5, [pd_16]
9266 psrld m5, 5
9267
9268 palignr m6, m7, m2, 4
9269 pmaddwd m6, [r3 - 4 * 16] ; [12]
9270 paddd m6, [pd_16]
9271 psrld m6, 5
9272 packusdw m5, m6
9273
9274 palignr m1, m7, m2, 8
9275 pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
9276 paddd m6, [pd_16]
9277 psrld m6, 5
9278
9279 pmaddwd m1, [r3 + 6 * 16] ; [22]
9280 paddd m1, [pd_16]
9281 psrld m1, 5
9282 packusdw m6, m1
9283
9284 palignr m1, m7, m2, 12
9285 pmaddwd m1, [r3 - 5 * 16] ; [11]
9286 paddd m1, [pd_16]
9287 psrld m1, 5
9288 packusdw m1, m1
9289 movhps m1, [r2 + 44] ; [00]
9290
9291 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9292%endmacro
9293;------------------------------------------------------------------------------------------------------------------
9294; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9295;------------------------------------------------------------------------------------------------------------------
9296INIT_XMM sse4
; intra_pred_ang32_4: 32x32 angular prediction, mode 4.
; Runs the transposing variant of MODE_4_32 eight times.  Each pass fills four
; destination rows, then the destination moves down four rows and the
; reference pointer advances by 8 bytes.
cglobal intra_pred_ang32_4, 3,6,8
    add         r1, r1                      ; stride * 2 (bytes per row)
    lea         r5, [r1 * 3]                ; r5 = 3 * stride for the store macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> weight row for fraction 16
    mov         r4d, 8                      ; 8 passes x 4 rows = 32 rows

.next_rows:
    MODE_4_32 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]
    dec         r4
    jnz         .next_rows
    RET
9310
;-----------------------------------------------------------------------------
; MODE_5_31 transpose_flag
; Computes 32 interpolated lines of 4 pixels each for the angle shared by
; modes 5 and 31 and hands them to TRANSPOSE_STORE_8x8 in four groups of eight
; (byte offsets 0, 16, 32, 48); %1 is forwarded as that macro's second argument.
; Each line is (pmaddwd of interleaved neighbouring reference words with one
; 16-byte ang_table row) + pd_16, shifted right by 5.  r3 must point at
; ang_table + 16 * 16, so [r3 + k * 16] is the weight row for fraction 16 + k;
; the bracketed number after each pmaddwd is that fraction.
; The final line has fraction 0 and is loaded straight from the reference.
; Lane comments list reference word indices from high to low, with [r2 + 2]
; being word 1; 'x' marks a lane holding stale data that is never consumed.
; Expects r2 = reference pointer, plus r0 / r1 / r5 as required by
; TRANSPOSE_STORE_8x8.  Clobbers m0-m7.
;-----------------------------------------------------------------------------
9311%macro MODE_5_31 1
9312 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9313 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9314 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9315 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9316 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9317
9318 pmaddwd m4, m0, [r3 + 16] ; [17]
9319 paddd m4, [pd_16]
9320 psrld m4, 5
9321
9322 palignr m1, m2, m0, 4
9323 mova m5, m1
9324 pmaddwd m1, [r3 - 14 * 16] ; [2]
9325 paddd m1, [pd_16]
9326 psrld m1, 5
9327 packusdw m4, m1
9328
9329 pmaddwd m5, [r3 + 3 * 16] ; [19]
9330 paddd m5, [pd_16]
9331 psrld m5, 5
9332
9333 palignr m6, m2, m0, 8
9334 mova m1, m6
9335 pmaddwd m6, [r3 - 12 * 16] ; [4]
9336 paddd m6, [pd_16]
9337 psrld m6, 5
9338 packusdw m5, m6
9339
9340 pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
9341 paddd m6, [pd_16]
9342 psrld m6, 5
9343
9344 palignr m1, m2, m0, 12
9345 mova m7, m1
9346 pmaddwd m7, [r3 - 10 * 16] ; [6]
9347 paddd m7, [pd_16]
9348 psrld m7, 5
9349 packusdw m6, m7
9350
9351 pmaddwd m1, [r3 + 7 * 16] ; [23]
9352 paddd m1, [pd_16]
9353 psrld m1, 5
9354
9355 pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
9356 paddd m7, [pd_16]
9357 psrld m7, 5
9358 packusdw m1, m7
9359
9360 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9361
9362 pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
9363 paddd m4, [pd_16]
9364 psrld m4, 5
9365
9366 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6] (m3 is still the raw row, so its low words 9,10 complete the pairs)
9367 pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
9368 paddd m1, [pd_16]
9369 psrld m1, 5
9370 packusdw m4, m1
9371
9372 pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
9373 paddd m5, [pd_16]
9374 psrld m5, 5
9375
9376 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
9377 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
9378 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
9379 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
9380 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
9381
9382 palignr m6, m2, m7, 4
9383 pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
9384 paddd m1, [pd_16]
9385 psrld m1, 5
9386 packusdw m5, m1
9387
9388 pmaddwd m6, [r3 + 13 * 16] ; [29]
9389 paddd m6, [pd_16]
9390 psrld m6, 5
9391
9392 palignr m1, m2, m7, 8
9393 mova m0, m1
9394 pmaddwd m1, [r3 - 2 * 16] ; [14]
9395 paddd m1, [pd_16]
9396 psrld m1, 5
9397 packusdw m6, m1
9398
9399 pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
9400 paddd m1, [pd_16]
9401 psrld m1, 5
9402
9403 palignr m0, m2, m7, 12
9404 pmaddwd m0, [r3] ; [16]
9405 paddd m0, [pd_16]
9406 psrld m0, 5
9407 packusdw m1, m0
9408
9409 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9410
9411 pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
9412 paddd m4, [pd_16]
9413 psrld m4, 5
9414
9415 pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
9416 paddd m1, [pd_16]
9417 psrld m1, 5
9418 packusdw m4, m1
9419
9420 palignr m1, m3, m2, 4
9421 pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
9422 paddd m5, [pd_16]
9423 psrld m5, 5
9424
9425 pmaddwd m1, [r3 + 4 * 16] ; [20]
9426 paddd m1, [pd_16]
9427 psrld m1, 5
9428 packusdw m5, m1
9429
9430 palignr m1, m3, m2, 8
9431 pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
9432 paddd m6, [pd_16]
9433 psrld m6, 5
9434
9435 pmaddwd m1, [r3 + 6 * 16] ; [22]
9436 paddd m1, [pd_16]
9437 psrld m1, 5
9438 packusdw m6, m1
9439
9440 palignr m7, m3, m2, 12
9441 pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
9442 paddd m1, [pd_16]
9443 psrld m1, 5
9444
9445 pmaddwd m7, [r3 + 8 * 16] ; [24]
9446 paddd m7, [pd_16]
9447 psrld m7, 5
9448 packusdw m1, m7
9449
9450 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9451
9452 pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
9453 paddd m4, [pd_16]
9454 psrld m4, 5
9455
9456 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
9457 paddd m1, [pd_16]
9458 psrld m1, 5
9459 packusdw m4, m1
9460
9461 movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
9462 palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
9463 punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
9464
9465 palignr m1, m0, m3, 4
9466 pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
9467 paddd m5, [pd_16]
9468 psrld m5, 5
9469
9470 pmaddwd m1, [r3 + 12 * 16] ; [28]
9471 paddd m1, [pd_16]
9472 psrld m1, 5
9473 packusdw m5, m1
9474
9475 palignr m1, m0, m3, 8
9476 pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
9477 paddd m6, [pd_16]
9478 psrld m6, 5
9479
9480 pmaddwd m1, [r3 + 14 * 16] ; [30]
9481 paddd m1, [pd_16]
9482 psrld m1, 5
9483 packusdw m6, m1
9484
9485 palignr m1, m0, m3, 12
9486 pmaddwd m1, [r3 - 16] ; [15]
9487 paddd m1, [pd_16]
9488 psrld m1, 5
9489 packusdw m1, m1
9490 movhps m1, [r2 + 36] ; [00]
9491
9492 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9493%endmacro
9494;------------------------------------------------------------------------------------------------------------------
9495; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9496;------------------------------------------------------------------------------------------------------------------
9497INIT_XMM sse4
; intra_pred_ang32_5: 32x32 angular prediction, mode 5.
; Runs the transposing variant of MODE_5_31 eight times.  Each pass fills four
; destination rows, then the destination moves down four rows and the
; reference pointer advances by 8 bytes.
cglobal intra_pred_ang32_5, 3,6,8
    add         r1, r1                      ; stride * 2 (bytes per row)
    lea         r5, [r1 * 3]                ; r5 = 3 * stride for the store macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> weight row for fraction 16
    mov         r4d, 8                      ; 8 passes x 4 rows = 32 rows

.next_rows:
    MODE_5_31 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]
    dec         r4
    jnz         .next_rows
    RET
9511
;-----------------------------------------------------------------------------
; MODE_6_30 transpose_flag
; Computes 32 interpolated lines of 4 pixels each for the angle shared by
; modes 6 and 30 and hands them to TRANSPOSE_STORE_8x8 in four groups of eight
; (byte offsets 0, 16, 32, 48); %1 is forwarded as that macro's second argument.
; Each line is (pmaddwd of interleaved neighbouring reference words with one
; 16-byte ang_table row) + pd_16, shifted right by 5.  r3 must point at
; ang_table + 16 * 16, so [r3 + k * 16] is the weight row for fraction 16 + k;
; the bracketed number after each pmaddwd is that fraction.
; The final line has fraction 0 and is loaded straight from the reference.
; Lane comments list reference word indices from high to low, with [r2 + 2]
; being word 1.
; Expects r2 = reference pointer, plus r0 / r1 / r5 as required by
; TRANSPOSE_STORE_8x8.  Clobbers m0-m7.
;-----------------------------------------------------------------------------
9512%macro MODE_6_30 1
9513 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9514 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9515 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9516 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9517 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9518
9519 pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
9520 paddd m4, [pd_16]
9521 psrld m4, 5
9522
9523 pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
9524 paddd m1, [pd_16]
9525 psrld m1, 5
9526 packusdw m4, m1
9527
9528 palignr m1, m2, m0, 4
9529 pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
9530 paddd m5, [pd_16]
9531 psrld m5, 5
9532
9533 pmaddwd m1, [r3 + 4 * 16] ; [20]
9534 paddd m1, [pd_16]
9535 psrld m1, 5
9536 packusdw m5, m1
9537
9538 palignr m1, m2, m0, 8
9539 pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
9540 paddd m6, [pd_16]
9541 psrld m6, 5
9542
9543 pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
9544 paddd m7, [pd_16]
9545 psrld m7, 5
9546 packusdw m6, m7
9547
9548 pmaddwd m1, [r3 + 11 * 16] ; [27]
9549 paddd m1, [pd_16]
9550 psrld m1, 5
9551
9552 palignr m7, m2, m0, 12
9553 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9554 paddd m0, [pd_16]
9555 psrld m0, 5
9556 packusdw m1, m0
9557
9558 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9559
9560 pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
9561 paddd m4, [pd_16]
9562 psrld m4, 5
9563
9564 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
9565 paddd m1, [pd_16]
9566 psrld m1, 5
9567 packusdw m4, m1
9568
9569 pmaddwd m5, m2, [r3 - 16] ; [15]
9570 paddd m5, [pd_16]
9571 psrld m5, 5
9572
9573 pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
9574 paddd m6, [pd_16]
9575 psrld m6, 5
9576 packusdw m5, m6
9577
9578 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6] (m3 is still the raw row, so its low words 9,10 complete the pairs)
9579 pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
9580 paddd m6, [pd_16]
9581 psrld m6, 5
9582
9583 pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
9584 paddd m1, [pd_16]
9585 psrld m1, 5
9586 packusdw m6, m1
9587
9588 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
9589 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
9590 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
9591 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
9592 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
9593
9594 palignr m0, m2, m7, 4
9595 pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
9596 paddd m1, [pd_16]
9597 psrld m1, 5
9598
9599 pmaddwd m0, [r3] ; [16]
9600 paddd m0, [pd_16]
9601 psrld m0, 5
9602 packusdw m1, m0
9603
9604 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9605
9606 palignr m4, m2, m7, 4
9607 pmaddwd m4, [r3 + 13 * 16] ; [29]
9608 paddd m4, [pd_16]
9609 psrld m4, 5
9610
9611 palignr m5, m2, m7, 8
9612 pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
9613 paddd m1, [pd_16]
9614 psrld m1, 5
9615 packusdw m4, m1
9616
9617 pmaddwd m5, [r3 + 7 * 16] ; [23]
9618 paddd m5, [pd_16]
9619 psrld m5, 5
9620
9621 palignr m1, m2, m7, 12
9622 pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
9623 paddd m6, [pd_16]
9624 psrld m6, 5
9625 packusdw m5, m6
9626
9627 pmaddwd m6, m1, [r3 + 16] ; [17]
9628 paddd m6, [pd_16]
9629 psrld m6, 5
9630
9631 pmaddwd m1, [r3 + 14 * 16] ; [30]
9632 paddd m1, [pd_16]
9633 psrld m1, 5
9634 packusdw m6, m1
9635
9636 pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
9637 paddd m1, [pd_16]
9638 psrld m1, 5
9639
9640 pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
9641 paddd m0, [pd_16]
9642 psrld m0, 5
9643 packusdw m1, m0
9644
9645 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9646
9647 palignr m5, m3, m2, 4
9648 pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
9649 paddd m4, [pd_16]
9650 psrld m4, 5
9651
9652 pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
9653 paddd m1, [pd_16]
9654 psrld m1, 5
9655 packusdw m4, m1
9656
9657 pmaddwd m5, [r3 + 15 * 16] ; [31]
9658 paddd m5, [pd_16]
9659 psrld m5, 5
9660
9661 palignr m6, m3, m2, 8
9662 pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
9663 paddd m1, [pd_16]
9664 psrld m1, 5
9665 packusdw m5, m1
9666
9667 pmaddwd m6, [r3 + 9 * 16] ; [25]
9668 paddd m6, [pd_16]
9669 psrld m6, 5
9670
9671 palignr m1, m3, m2, 12
9672 pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
9673 paddd m0, [pd_16]
9674 psrld m0, 5
9675 packusdw m6, m0
9676
9677 pmaddwd m1, [r3 + 3 * 16] ; [19]
9678 paddd m1, [pd_16]
9679 psrld m1, 5
9680 packusdw m1, m1
9681 movhps m1, [r2 + 28] ; [00]
9682
9683 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9684%endmacro
9685;------------------------------------------------------------------------------------------------------------------
9686; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9687;------------------------------------------------------------------------------------------------------------------
9688INIT_XMM sse4
; intra_pred_ang32_6: 32x32 angular prediction, mode 6.
; Runs the transposing variant of MODE_6_30 eight times.  Each pass fills four
; destination rows, then the destination moves down four rows and the
; reference pointer advances by 8 bytes.
cglobal intra_pred_ang32_6, 3,6,8
    add         r1, r1                      ; stride * 2 (bytes per row)
    lea         r5, [r1 * 3]                ; r5 = 3 * stride for the store macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> weight row for fraction 16
    mov         r4d, 8                      ; 8 passes x 4 rows = 32 rows

.next_rows:
    MODE_6_30 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]
    dec         r4
    jnz         .next_rows
    RET
9702
9703%macro MODE_7_29 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index advances by 9 (mod 32) per group; %1 is forwarded to TRANSPOSE_STORE_8x8
9704 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9705 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] (movd loads 32 bits and zeroes the rest; only sample 9 is consumed)
9706 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9707 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9708 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9709
9710 pmaddwd m4, m0, [r3 - 7 * 16] ; [9] bracketed number = ang_table row, i.e. (32 - 9) * a + 9 * b per sample pair (callers in view set r3 = ang_table + 16 * 16)
9711 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
9712 psrld m4, 5 ; divide by 32
9713
9714 pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
9715 paddd m1, [pd_16]
9716 psrld m1, 5
9717 packusdw m4, m1 ; m4 = group [9] in the low half, group [18] in the high half, as words
9718
9719 pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
9720 paddd m5, [pd_16]
9721 psrld m5, 5
9722
9723 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9724 pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
9725 paddd m6, [pd_16]
9726 psrld m6, 5
9727 packusdw m5, m6
9728
9729 pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
9730 paddd m6, [pd_16]
9731 psrld m6, 5
9732
9733 pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
9734 paddd m7, [pd_16]
9735 psrld m7, 5
9736 packusdw m6, m7
9737
9738 pmaddwd m1, [r3 + 15 * 16] ; [31]
9739 paddd m1, [pd_16]
9740 psrld m1, 5
9741
9742 mova m3, m0 ; keep [5 4 4 3 3 2 2 1]; m0 is reused as scratch from here on
9743 palignr m7, m2, m0, 8 ; [7 6 6 5 5 4 4 3]
9744 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9745 paddd m0, [pd_16]
9746 psrld m0, 5
9747 packusdw m1, m0
9748
9749 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
9750
9751 pmaddwd m4, m7, [r3 + 16] ; [17]
9752 paddd m4, [pd_16]
9753 psrld m4, 5
9754
9755 pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
9756 paddd m1, [pd_16]
9757 psrld m1, 5
9758 packusdw m4, m1
9759
9760 palignr m1, m2, m3, 12 ; [8 7 7 6 6 5 5 4]
9761 pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
9762 paddd m5, [pd_16]
9763 psrld m5, 5
9764
9765 pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
9766 paddd m6, [pd_16]
9767 psrld m6, 5
9768 packusdw m5, m6
9769
9770 pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
9771 paddd m6, [pd_16]
9772 psrld m6, 5
9773
9774 pmaddwd m1, [r3 + 14 * 16] ; [30]
9775 paddd m1, [pd_16]
9776 psrld m1, 5
9777 packusdw m6, m1
9778
9779 pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
9780 paddd m1, [pd_16]
9781 psrld m1, 5
9782
9783 pmaddwd m0, m2, [r3] ; [16]
9784 paddd m0, [pd_16]
9785 psrld m0, 5
9786 packusdw m1, m0
9787
9788 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
9789
9790 pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
9791 paddd m4, [pd_16]
9792 psrld m4, 5
9793
9794 movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9795 palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
9796 punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
9797
9798 palignr m6, m7, m2, 4 ; [10 9 9 8 8 7 7 6]
9799 pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
9800 paddd m1, [pd_16]
9801 psrld m1, 5
9802 packusdw m4, m1
9803
9804 pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
9805 paddd m5, [pd_16]
9806 psrld m5, 5
9807
9808 pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
9809 paddd m0, [pd_16]
9810 psrld m0, 5
9811 packusdw m5, m0
9812
9813 pmaddwd m6, [r3 + 13 * 16] ; [29]
9814 paddd m6, [pd_16]
9815 psrld m6, 5
9816
9817 palignr m0, m7, m2, 8 ; [11 10 10 9 9 8 8 7]
9818 pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
9819 paddd m1, [pd_16]
9820 psrld m1, 5
9821 packusdw m6, m1
9822
9823 pmaddwd m1, m0, [r3 - 16] ; [15]
9824 paddd m1, [pd_16]
9825 psrld m1, 5
9826
9827 pmaddwd m0, [r3 + 8 * 16] ; [24]
9828 paddd m0, [pd_16]
9829 psrld m0, 5
9830 packusdw m1, m0
9831
9832 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
9833
9834 palignr m0, m7, m2, 12 ; [12 11 11 10 10 9 9 8]
9835 pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
9836 paddd m4, [pd_16]
9837 psrld m4, 5
9838
9839 pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
9840 paddd m1, [pd_16]
9841 psrld m1, 5
9842 packusdw m4, m1
9843
9844 pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
9845 paddd m5, [pd_16]
9846 psrld m5, 5
9847
9848 pmaddwd m0, [r3 + 12 * 16] ; [28]
9849 paddd m0, [pd_16]
9850 psrld m0, 5
9851 packusdw m5, m0
9852
9853 pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
9854 paddd m6, [pd_16]
9855 psrld m6, 5
9856
9857 pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
9858 paddd m0, [pd_16]
9859 psrld m0, 5
9860 packusdw m6, m0
9861
9862 pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
9863 paddd m1, [pd_16]
9864 psrld m1, 5
9865 packusdw m1, m1
9866 movhps m1, [r2 + 20] ; [00] weight index 0: the last group is 4 reference samples copied unfiltered into the high half
9867
9868 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 ; result groups 24-31
9869%endmacro
9870;------------------------------------------------------------------------------------------------------------------
9871; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9872;------------------------------------------------------------------------------------------------------------------
9873INIT_XMM sse4
9874cglobal intra_pred_ang32_7, 3,6,8 ; only dst (r0), dstStride (r1) and refLeft (r2) are loaded; r3-r5 are scratch
9875 add r1, r1 ; r1 = 2 * dstStride
9876 lea r5, [r1 * 3] ; r5 = 3 * r1
9877 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
9878 mov r4d, 8 ; number of .loop passes
9879
9880.loop:
9881 MODE_7_29 1
9882 add r2, 8 ; advance the reference pointer by 8 bytes
9883 lea r0, [r0 + r1 * 4 ] ; advance dst by 4 * r1 bytes
9884 dec r4d
9885 jnz .loop
9886 RET
9887
9888%macro MODE_8_28 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index advances by 5 (mod 32) per group; %1 is forwarded to TRANSPOSE_STORE_8x8
9889 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9890 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] (movd loads 32 bits and zeroes the rest; only sample 9 is consumed)
9891 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9892 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9893 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9894
9895 pmaddwd m4, m0, [r3 - 11 * 16] ; [5] bracketed number = ang_table row, i.e. (32 - 5) * a + 5 * b per sample pair (callers in view set r3 = ang_table + 16 * 16)
9896 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
9897 psrld m4, 5 ; divide by 32
9898
9899 pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
9900 paddd m1, [pd_16]
9901 psrld m1, 5
9902 packusdw m4, m1 ; m4 = group [5] in the low half, group [10] in the high half, as words
9903
9904 pmaddwd m5, m0, [r3 - 16] ; [15]
9905 paddd m5, [pd_16]
9906 psrld m5, 5
9907
9908 pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
9909 paddd m6, [pd_16]
9910 psrld m6, 5
9911 packusdw m5, m6
9912
9913 pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
9914 paddd m6, [pd_16]
9915 psrld m6, 5
9916
9917 pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
9918 paddd m1, [pd_16]
9919 psrld m1, 5
9920 packusdw m6, m1
9921
9922 palignr m7, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9923 pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
9924 paddd m1, [pd_16]
9925 psrld m1, 5
9926
9927 mova m3, m0 ; keep [5 4 4 3 3 2 2 1]; m0 is reused as scratch from here on
9928 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9929 paddd m0, [pd_16]
9930 psrld m0, 5
9931 packusdw m1, m0
9932
9933 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
9934
9935 pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
9936 paddd m4, [pd_16]
9937 psrld m4, 5
9938
9939 pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
9940 paddd m1, [pd_16]
9941 psrld m1, 5
9942 packusdw m4, m1
9943
9944 pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
9945 paddd m5, [pd_16]
9946 psrld m5, 5
9947
9948 pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
9949 paddd m6, [pd_16]
9950 psrld m6, 5
9951 packusdw m5, m6
9952
9953 palignr m7, m2, m3, 8 ; [7 6 6 5 5 4 4 3]
9954 pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
9955 paddd m6, [pd_16]
9956 psrld m6, 5
9957
9958 pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
9959 paddd m1, [pd_16]
9960 psrld m1, 5
9961 packusdw m6, m1
9962
9963 pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
9964 paddd m1, [pd_16]
9965 psrld m1, 5
9966
9967 pmaddwd m0, m7, [r3] ; [16]
9968 paddd m0, [pd_16]
9969 psrld m0, 5
9970 packusdw m1, m0
9971
9972 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
9973
9974 pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
9975 paddd m4, [pd_16]
9976 psrld m4, 5
9977
9978 pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
9979 paddd m1, [pd_16]
9980 psrld m1, 5
9981 packusdw m4, m1
9982
9983 pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
9984 paddd m5, [pd_16]
9985 psrld m5, 5
9986
9987 palignr m7, m2, m3, 12 ; [8 7 7 6 6 5 5 4]
9988 pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
9989 paddd m0, [pd_16]
9990 psrld m0, 5
9991 packusdw m5, m0
9992
9993 pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
9994 paddd m6, [pd_16]
9995 psrld m6, 5
9996
9997 pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
9998 paddd m1, [pd_16]
9999 psrld m1, 5
10000 packusdw m6, m1
10001
10002 pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
10003 paddd m1, [pd_16]
10004 psrld m1, 5
10005
10006 pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
10007 paddd m0, [pd_16]
10008 psrld m0, 5
10009 packusdw m1, m0
10010
10011 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
10012
10013 pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
10014 paddd m4, [pd_16]
10015 psrld m4, 5
10016
10017 pmaddwd m1, m2, [r3 - 14 * 16] ; [2] m2 is still [9 8 8 7 7 6 6 5]
10018 paddd m1, [pd_16]
10019 psrld m1, 5
10020 packusdw m4, m1
10021
10022 pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
10023 paddd m5, [pd_16]
10024 psrld m5, 5
10025
10026 pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
10027 paddd m0, [pd_16]
10028 psrld m0, 5
10029 packusdw m5, m0
10030
10031 pmaddwd m6, m2, [r3 + 16] ; [17]
10032 paddd m6, [pd_16]
10033 psrld m6, 5
10034
10035 pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
10036 paddd m0, [pd_16]
10037 psrld m0, 5
10038 packusdw m6, m0
10039
10040 pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
10041 paddd m1, [pd_16]
10042 psrld m1, 5
10043 packusdw m1, m1
10044 movhps m1, [r2 + 12] ; [00] weight index 0: the last group is 4 reference samples copied unfiltered into the high half
10045
10046 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 ; result groups 24-31
10047%endmacro
10048;------------------------------------------------------------------------------------------------------------------
10049; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10050;------------------------------------------------------------------------------------------------------------------
10051INIT_XMM sse4
10052cglobal intra_pred_ang32_8, 3,6,8 ; only dst (r0), dstStride (r1) and refLeft (r2) are loaded; r3-r5 are scratch
10053 add r1, r1 ; r1 = 2 * dstStride
10054 lea r5, [r1 * 3] ; r5 = 3 * r1
10055 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
10056 mov r4d, 8 ; number of .loop passes
10057
10058.loop:
10059 MODE_8_28 1
10060 add r2, 8 ; advance the reference pointer by 8 bytes
10061 lea r0, [r0 + r1 * 4 ] ; advance dst by 4 * r1 bytes
10062 dec r4d
10063 jnz .loop
10064 RET
10065
10066%macro MODE_9_27 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index advances by 2 per group (2..30, 0, 2..30, 0); %1 is forwarded to TRANSPOSE_STORE_8x8
10067 movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
10068 palignr m1, m3, 2 ; [x 8 7 6 5 4 3 2] (two-operand form: the top word comes from the previous m1)
10069 punpckhwd m2, m3, m1 ; [x 8 8 7 7 6 6 5] (only the low pair [6 5] is consumed below)
10070 punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
10071
10072 pmaddwd m4, m3, [r3 - 14 * 16] ; [2] bracketed number = ang_table row, i.e. (32 - 2) * a + 2 * b per sample pair (callers in view set r3 = ang_table + 16 * 16)
10073 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
10074 psrld m4, 5 ; divide by 32
10075
10076 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
10077 paddd m1, [pd_16]
10078 psrld m1, 5
10079 packusdw m4, m1 ; m4 = group [2] in the low half, group [4] in the high half, as words
10080
10081 pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
10082 paddd m5, [pd_16]
10083 psrld m5, 5
10084
10085 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
10086 paddd m6, [pd_16]
10087 psrld m6, 5
10088 packusdw m5, m6
10089
10090 pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
10091 paddd m6, [pd_16]
10092 psrld m6, 5
10093
10094 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
10095 paddd m1, [pd_16]
10096 psrld m1, 5
10097 packusdw m6, m1
10098
10099 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
10100 paddd m1, [pd_16]
10101 psrld m1, 5
10102
10103 pmaddwd m0, m3, [r3] ; [16]
10104 paddd m0, [pd_16]
10105 psrld m0, 5
10106 packusdw m1, m0
10107
10108 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
10109
10110 pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
10111 paddd m4, [pd_16]
10112 psrld m4, 5
10113
10114 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
10115 paddd m1, [pd_16]
10116 psrld m1, 5
10117 packusdw m4, m1
10118
10119 pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
10120 paddd m5, [pd_16]
10121 psrld m5, 5
10122
10123 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
10124 paddd m6, [pd_16]
10125 psrld m6, 5
10126 packusdw m5, m6
10127
10128 pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
10129 paddd m6, [pd_16]
10130 psrld m6, 5
10131
10132 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
10133 paddd m1, [pd_16]
10134 psrld m1, 5
10135 packusdw m6, m1
10136
10137 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10138 paddd m1, [pd_16]
10139 psrld m1, 5
10140
10141 packusdw m1, m1
10142 movhps m1, [r2 + 4] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
10143
10144 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
10145
10146 palignr m7, m2, m3, 4 ; [6 5 5 4 4 3 3 2] source pairs moved up by one sample for the second half
10147 pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
10148 paddd m4, [pd_16]
10149 psrld m4, 5
10150
10151 pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
10152 paddd m1, [pd_16]
10153 psrld m1, 5
10154 packusdw m4, m1
10155
10156 pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
10157 paddd m5, [pd_16]
10158 psrld m5, 5
10159
10160 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
10161 paddd m0, [pd_16]
10162 psrld m0, 5
10163 packusdw m5, m0
10164
10165 pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
10166 paddd m6, [pd_16]
10167 psrld m6, 5
10168
10169 pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
10170 paddd m1, [pd_16]
10171 psrld m1, 5
10172 packusdw m6, m1
10173
10174 pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
10175 paddd m1, [pd_16]
10176 psrld m1, 5
10177
10178 pmaddwd m0, m7, [r3] ; [16]
10179 paddd m0, [pd_16]
10180 psrld m0, 5
10181 packusdw m1, m0
10182
10183 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
10184
10185 pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
10186 paddd m4, [pd_16]
10187 psrld m4, 5
10188
10189 pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
10190 paddd m1, [pd_16]
10191 psrld m1, 5
10192 packusdw m4, m1
10193
10194 pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
10195 paddd m5, [pd_16]
10196 psrld m5, 5
10197
10198 pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
10199 paddd m0, [pd_16]
10200 psrld m0, 5
10201 packusdw m5, m0
10202
10203 pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
10204 paddd m6, [pd_16]
10205 psrld m6, 5
10206
10207 pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
10208 paddd m0, [pd_16]
10209 psrld m0, 5
10210 packusdw m6, m0
10211
10212 pmaddwd m7, [r3 + 14 * 16] ; [30]
10213 paddd m7, [pd_16]
10214 psrld m7, 5
10215 packusdw m7, m7
10216 movhps m7, [r2 + 6] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
10217
10218 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7 ; result groups 24-31
10219%endmacro
10220;------------------------------------------------------------------------------------------------------------------
10221; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10222;------------------------------------------------------------------------------------------------------------------
10223INIT_XMM sse4
10224cglobal intra_pred_ang32_9, 3,6,8 ; only dst (r0), dstStride (r1) and refLeft (r2) are loaded; r3-r5 are scratch
10225 add r1, r1 ; r1 = 2 * dstStride
10226 lea r5, [r1 * 3] ; r5 = 3 * r1
10227 lea r3, [ang_table + 16 * 16] ; [r3 + k * 16] addresses ang_table row 16 + k
10228 mov r4d, 8 ; number of .loop passes
10229
10230.loop:
10231 MODE_9_27 1
10232 add r2, 8 ; advance the reference pointer by 8 bytes
10233 lea r0, [r0 + r1 * 4 ] ; advance dst by 4 * r1 bytes
10234 dec r4d
10235 jnz .loop
10236 RET
10237
10238;------------------------------------------------------------------------------------------------------------------
10239; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10240;------------------------------------------------------------------------------------------------------------------
10241INIT_XMM sse4
10242cglobal intra_pred_ang32_10, 3,7,3 ; refAbove (arg 3) is never read, so only dst/dstStride/refLeft are loaded; r3-r6 are scratch; only m0-m2 are live
10243 mov r6d, 4 ; 4 passes, 8 rows written per pass
10244 add r1, r1 ; r1 = 2 * dstStride
10245 lea r5, [r1 * 3] ; row offset 3
10246 lea r4, [r1 * 2] ; row offset 2
10247 lea r3, [r1 * 4] ; 4-row advance for dst
10248 mova m2, [c_mode32_10_0] ; pshufb mask (defined elsewhere; presumably replicates word 0 across the register)
10249
10250.loop:
10251 movu m0, [r2 + 2] ; 8 reference words for this pass
10252 pshufb m1, m0, m2
10253 movu [r0], m1 ; each row is written as 4 x 16 bytes from the same register
10254 movu [r0 + 16], m1
10255 movu [r0 + 32], m1
10256 movu [r0 + 48], m1
10257
10258 palignr m1, m0, 2 ; low word of m1 = word 1 of m0
10259 pshufb m1, m2
10260 movu [r0 + r1], m1
10261 movu [r0 + r1 + 16], m1
10262 movu [r0 + r1 + 32], m1
10263 movu [r0 + r1 + 48], m1
10264
10265 palignr m1, m0, 4 ; word 2
10266 pshufb m1, m2
10267 movu [r0 + r4], m1
10268 movu [r0 + r4 + 16], m1
10269 movu [r0 + r4 + 32], m1
10270 movu [r0 + r4 + 48], m1
10271
10272 palignr m1, m0, 6 ; word 3
10273 pshufb m1, m2
10274 movu [r0 + r5], m1
10275 movu [r0 + r5 + 16], m1
10276 movu [r0 + r5 + 32], m1
10277 movu [r0 + r5 + 48], m1
10278
10279 add r0, r3 ; next 4 rows
10280
10281 palignr m1, m0, 8 ; word 4
10282 pshufb m1, m2
10283 movu [r0], m1
10284 movu [r0 + 16], m1
10285 movu [r0 + 32], m1
10286 movu [r0 + 48], m1
10287
10288 palignr m1, m0, 10 ; word 5
10289 pshufb m1, m2
10290 movu [r0 + r1], m1
10291 movu [r0 + r1 + 16], m1
10292 movu [r0 + r1 + 32], m1
10293 movu [r0 + r1 + 48], m1
10294
10295 palignr m1, m0, 12 ; word 6
10296 pshufb m1, m2
10297 movu [r0 + r4], m1
10298 movu [r0 + r4 + 16], m1
10299 movu [r0 + r4 + 32], m1
10300 movu [r0 + r4 + 48], m1
10301
10302 palignr m1, m0, 14 ; word 7
10303 pshufb m1, m2
10304 movu [r0 + r5], m1
10305 movu [r0 + r5 + 16], m1
10306 movu [r0 + r5 + 32], m1
10307 movu [r0 + r5 + 48], m1
10308
10309 add r0, r3 ; next 4 rows
10310 add r2, 16 ; next 8 reference words
10311 dec r6d
10312 jnz .loop
10313 RET
10314
10315%macro MODE_11_25 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index falls by 2 per group (30..2, 0, 30..2, 0); %1 is forwarded to TRANSPOSE_STORE_8x8
10316 movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
10317 pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
10318
10319 pmaddwd m4, m3, [r3 + 14 * 16] ; [30] bracketed number = ang_table row, i.e. (32 - 30) * a + 30 * b per sample pair (the caller in view sets r3 = ang_table + 16 * 16)
10320 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
10321 psrld m4, 5 ; divide by 32
10322
10323 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
10324 paddd m1, [pd_16]
10325 psrld m1, 5
10326 packusdw m4, m1 ; m4 = group [30] in the low half, group [28] in the high half, as words
10327
10328 pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
10329 paddd m5, [pd_16]
10330 psrld m5, 5
10331
10332 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
10333 paddd m6, [pd_16]
10334 psrld m6, 5
10335 packusdw m5, m6
10336
10337 pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
10338 paddd m6, [pd_16]
10339 psrld m6, 5
10340
10341 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
10342 paddd m1, [pd_16]
10343 psrld m1, 5
10344 packusdw m6, m1
10345
10346 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10347 paddd m1, [pd_16]
10348 psrld m1, 5
10349
10350 pmaddwd m0, m3, [r3] ; [16]
10351 paddd m0, [pd_16]
10352 psrld m0, 5
10353 packusdw m1, m0
10354
10355 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
10356
10357 pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
10358 paddd m4, [pd_16]
10359 psrld m4, 5
10360
10361 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
10362 paddd m1, [pd_16]
10363 psrld m1, 5
10364 packusdw m4, m1
10365
10366 pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
10367 paddd m5, [pd_16]
10368 psrld m5, 5
10369
10370 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
10371 paddd m6, [pd_16]
10372 psrld m6, 5
10373 packusdw m5, m6
10374
10375 pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
10376 paddd m6, [pd_16]
10377 psrld m6, 5
10378
10379 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
10380 paddd m1, [pd_16]
10381 psrld m1, 5
10382 packusdw m6, m1
10383
10384 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
10385 paddd m1, [pd_16]
10386 psrld m1, 5
10387
10388 packusdw m1, m1
10389 movhps m1, [r2 + 2] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
10390
10391 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
10392
10393 movu m3, [r2] ; [6 5 4 3 2 1 0 16] window moved one sample toward lower addresses
10394 pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
10395
10396 pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
10397 paddd m4, [pd_16]
10398 psrld m4, 5
10399
10400 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
10401 paddd m1, [pd_16]
10402 psrld m1, 5
10403 packusdw m4, m1
10404
10405 pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
10406 paddd m5, [pd_16]
10407 psrld m5, 5
10408
10409 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
10410 paddd m0, [pd_16]
10411 psrld m0, 5
10412 packusdw m5, m0
10413
10414 pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
10415 paddd m6, [pd_16]
10416 psrld m6, 5
10417
10418 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
10419 paddd m1, [pd_16]
10420 psrld m1, 5
10421 packusdw m6, m1
10422
10423 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10424 paddd m1, [pd_16]
10425 psrld m1, 5
10426
10427 pmaddwd m0, m3, [r3] ; [16]
10428 paddd m0, [pd_16]
10429 psrld m0, 5
10430 packusdw m1, m0
10431
10432 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
10433
10434 pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
10435 paddd m4, [pd_16]
10436 psrld m4, 5
10437
10438 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
10439 paddd m1, [pd_16]
10440 psrld m1, 5
10441 packusdw m4, m1
10442
10443 pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
10444 paddd m5, [pd_16]
10445 psrld m5, 5
10446
10447 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
10448 paddd m6, [pd_16]
10449 psrld m6, 5
10450 packusdw m5, m6
10451
10452 pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
10453 paddd m6, [pd_16]
10454 psrld m6, 5
10455
10456 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
10457 paddd m1, [pd_16]
10458 psrld m1, 5
10459 packusdw m6, m1
10460
10461 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
10462 paddd m1, [pd_16]
10463 psrld m1, 5
10464
10465 packusdw m1, m1
10466 movhps m1, [r2] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
10467
10468 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 ; result groups 24-31
10469%endmacro
10470;------------------------------------------------------------------------------------------------------------------
10471; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10472;------------------------------------------------------------------------------------------------------------------
10473INIT_XMM sse4
10474cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4) ; 4 args loaded, 6 GPRs, 7 XMM regs, negative stack-size request of 4*mmsize+4 bytes (see x86inc for the sign convention)
10475 movu m0, [r2 + 0*mmsize] ; read 64 bytes from r2 (refLeft per the prototype above)
10476 movu m1, [r2 + 1*mmsize]
10477 movu m2, [r2 + 2*mmsize]
10478 movu m3, [r2 + 3*mmsize]
10479 movu [rsp + 0*mmsize + 2], m0 ; stack copy of those 64 bytes starts at rsp + 2
10480 movu [rsp + 1*mmsize + 2], m1
10481 movu [rsp + 2*mmsize + 2], m2
10482 movu [rsp + 3*mmsize + 2], m3
10483 mov r4w, [r3+32] ; word at byte offset 32 of r3 (refAbove per the prototype above)
10484 mov [rsp], r4w ; stored as the word immediately before the copied block
10485 mov r4w, [r2+64] ; word that follows the 64 bytes already copied
10486 mov [rsp+66], r4w ; appended right after the copied block
10487
10488 lea r3, [ang_table + 16 * 16] ; r3 now addresses the weight table: [r3 + k * 16] is ang_table row 16 + k
10489 mov r4d, 8 ; number of .loop passes
10490 mov r2, rsp ; MODE_11_25 reads its samples from the stack copy
10491 add r1, r1 ; r1 = 2 * dstStride
10492 lea r5, [r1 * 3] ; r5 = 3 * r1
10493
10494.loop:
10495 MODE_11_25 1
10496 lea r0, [r0 + r1 * 4 ] ; advance dst by 4 * r1 bytes
10497 add r2, 8 ; advance the source window by 8 bytes
10498 dec r4
10499 jnz .loop
10500 RET
10501
10502%macro MODE_12_24 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index falls by 5 (mod 32) per group and the source window steps back 2 bytes at each wrap; expects m2 = pair-interleave pshufb mask; %1 is forwarded to TRANSPOSE_STORE_8x8
10503 movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
10504 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
10505
10506 pmaddwd m4, m3, [r3 + 11 * 16] ; [27] bracketed number = ang_table row, i.e. (32 - 27) * a + 27 * b per sample pair (the caller in view sets r3 = ang_table + 16 * 16)
10507 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
10508 psrld m4, 5 ; divide by 32
10509
10510 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
10511 paddd m1, [pd_16]
10512 psrld m1, 5
10513 packusdw m4, m1 ; m4 = group [27] in the low half, group [22] in the high half, as words
10514
10515 pmaddwd m5, m3, [r3 + 16] ; [17]
10516 paddd m5, [pd_16]
10517 psrld m5, 5
10518
10519 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
10520 paddd m6, [pd_16]
10521 psrld m6, 5
10522 packusdw m5, m6
10523
10524 pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
10525 paddd m6, [pd_16]
10526 psrld m6, 5
10527
10528 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
10529 paddd m1, [pd_16]
10530 psrld m1, 5
10531 packusdw m6, m1
10532
10533 movu m3, [r2 + 6] ; weight index wrapped: window moves back one sample (2 bytes)
10534 pshufb m3, m2
10535
10536 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
10537 paddd m1, [pd_16]
10538 psrld m1, 5
10539
10540 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
10541 paddd m0, [pd_16]
10542 psrld m0, 5
10543 packusdw m1, m0
10544
10545 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
10546
10547 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
10548 paddd m4, [pd_16]
10549 psrld m4, 5
10550
10551 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
10552 paddd m1, [pd_16]
10553 psrld m1, 5
10554 packusdw m4, m1
10555
10556 pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
10557 paddd m5, [pd_16]
10558 psrld m5, 5
10559
10560 pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
10561 paddd m6, [pd_16]
10562 psrld m6, 5
10563 packusdw m5, m6
10564
10565 movu m3, [r2 + 4] ; window back one more sample
10566 pshufb m3, m2
10567
10568 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
10569 paddd m6, [pd_16]
10570 psrld m6, 5
10571
10572 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
10573 paddd m1, [pd_16]
10574 psrld m1, 5
10575 packusdw m6, m1
10576
10577 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
10578 paddd m1, [pd_16]
10579 psrld m1, 5
10580
10581 pmaddwd m0, m3, [r3] ; [16]
10582 paddd m0, [pd_16]
10583 psrld m0, 5
10584 packusdw m1, m0
10585
10586 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
10587
10588 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
10589 paddd m4, [pd_16]
10590 psrld m4, 5
10591
10592 pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
10593 paddd m1, [pd_16]
10594 psrld m1, 5
10595 packusdw m4, m1
10596
10597 pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
10598 paddd m5, [pd_16]
10599 psrld m5, 5
10600
10601 movu m3, [r2 + 2] ; window back one more sample
10602 pshufb m3, m2
10603
10604 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
10605 paddd m0, [pd_16]
10606 psrld m0, 5
10607 packusdw m5, m0
10608
10609 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
10610 paddd m6, [pd_16]
10611 psrld m6, 5
10612
10613 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10614 paddd m1, [pd_16]
10615 psrld m1, 5
10616 packusdw m6, m1
10617
10618 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
10619 paddd m1, [pd_16]
10620 psrld m1, 5
10621
10622 pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
10623 paddd m0, [pd_16]
10624 psrld m0, 5
10625 packusdw m1, m0
10626
10627 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
10628
10629 pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
10630 paddd m4, [pd_16]
10631 psrld m4, 5
10632
10633 movu m3, [r2] ; window back one more sample
10634 pshufb m3, m2
10635
10636 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10637 paddd m1, [pd_16]
10638 psrld m1, 5
10639 packusdw m4, m1
10640
10641 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
10642 paddd m5, [pd_16]
10643 psrld m5, 5
10644
10645 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
10646 paddd m6, [pd_16]
10647 psrld m6, 5
10648 packusdw m5, m6
10649
10650 pmaddwd m6, m3, [r3 - 16] ; [15]
10651 paddd m6, [pd_16]
10652 psrld m6, 5
10653
10654 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
10655 paddd m1, [pd_16]
10656 psrld m1, 5
10657 packusdw m6, m1
10658
10659 pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
10660 paddd m1, [pd_16]
10661 psrld m1, 5
10662
10663 packusdw m1, m1
10664 movhps m1, [r2] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
10665
10666 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 ; result groups 24-31
10667%endmacro
10668;------------------------------------------------------------------------------------------------------------------
10669; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10670;------------------------------------------------------------------------------------------------------------------
10671INIT_XMM sse4
10672cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10) ; 4 args loaded, 6 GPRs, 7 XMM regs, negative stack-size request of 4*mmsize+10 bytes (see x86inc for the sign convention)
10673 movu m0, [r2 + 0*mmsize] ; read 64 bytes from r2 (refLeft per the prototype above)
10674 movu m1, [r2 + 1*mmsize]
10675 movu m2, [r2 + 2*mmsize]
10676 movu m3, [r2 + 3*mmsize]
10677 movu [rsp + 0*mmsize + 8], m0 ; stack copy of those 64 bytes starts at rsp + 8
10678 movu [rsp + 1*mmsize + 8], m1
10679 movu [rsp + 2*mmsize + 8], m2
10680 movu [rsp + 3*mmsize + 8], m3
10681
10682 mov r4w, [r2+64] ; word that follows the 64 bytes already copied
10683 mov [rsp+72], r4w ; appended right after the copied block
10684 mov r4w, [r3+12] ; four single words from r3 (refAbove per the prototype above) ...
10685 mov [rsp+6], r4w ; ... are laid out in front of the copied block, nearest first
10686 mov r4w, [r3+26]
10687 mov [rsp+4], r4w
10688 mov r4w, [r3+38]
10689 mov [rsp+2], r4w
10690 mov r4w, [r3+52]
10691 mov [rsp], r4w
10692
10693 lea r3, [ang_table + 16 * 16] ; r3 now addresses the weight table: [r3 + k * 16] is ang_table row 16 + k
10694 mov r4d, 8 ; number of .loop passes
10695 mov r2, rsp ; MODE_12_24 reads its samples from the stack copy
10696 add r1, r1 ; r1 = 2 * dstStride
10697 lea r5, [r1 * 3] ; r5 = 3 * r1
10698 mova m2, [pw_punpcklwd] ; shuffle mask that MODE_12_24 applies via pshufb m3, m2
10699
10700.loop:
10701 MODE_12_24 1
10702 lea r0, [r0 + r1 * 4 ] ; advance dst by 4 * r1 bytes
10703 add r2, 8 ; advance the source window by 8 bytes
10704 dec r4
10705 jnz .loop
10706 RET
10707
10708%macro MODE_13_23 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index falls by 9 (mod 32) per group and the source window steps back 2 bytes at each wrap; expects m2 = pair-interleave pshufb mask; %1 is forwarded to TRANSPOSE_STORE_8x8
10709 movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
10710 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
10711
10712 pmaddwd m4, m3, [r3 + 7 * 16] ; [23] bracketed number = ang_table row, i.e. (32 - 23) * a + 23 * b per sample pair (the caller in view sets r3 = ang_table + 16 * 16)
10713 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
10714 psrld m4, 5 ; divide by 32
10715
10716 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
10717 paddd m1, [pd_16]
10718 psrld m1, 5
10719 packusdw m4, m1 ; m4 = group [23] in the low half, group [14] in the high half, as words
10720
10721 pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
10722 paddd m5, [pd_16]
10723 psrld m5, 5
10724
10725 movu m3, [r2 + 14] ; weight index wrapped: window moves back one sample (2 bytes)
10726 pshufb m3, m2
10727
10728 pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
10729 paddd m6, [pd_16]
10730 psrld m6, 5
10731 packusdw m5, m6
10732
10733 pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
10734 paddd m6, [pd_16]
10735 psrld m6, 5
10736
10737 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
10738 paddd m1, [pd_16]
10739 psrld m1, 5
10740 packusdw m6, m1
10741
10742 pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
10743 paddd m1, [pd_16]
10744 psrld m1, 5
10745
10746 movu m3, [r2 + 12] ; window back one more sample
10747 pshufb m3, m2
10748
10749 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
10750 paddd m0, [pd_16]
10751 psrld m0, 5
10752 packusdw m1, m0
10753
10754 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
10755
10756 pmaddwd m4, m3, [r3 - 16] ; [15]
10757 paddd m4, [pd_16]
10758 psrld m4, 5
10759
10760 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
10761 paddd m1, [pd_16]
10762 psrld m1, 5
10763 packusdw m4, m1
10764
10765 movu m3, [r2 + 10] ; window back one more sample
10766 pshufb m3, m2
10767
10768 pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
10769 paddd m5, [pd_16]
10770 psrld m5, 5
10771
10772 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
10773 paddd m6, [pd_16]
10774 psrld m6, 5
10775 packusdw m5, m6
10776
10777 pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
10778 paddd m6, [pd_16]
10779 psrld m6, 5
10780
10781 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
10782 paddd m1, [pd_16]
10783 psrld m1, 5
10784 packusdw m6, m1
10785
10786 movu m3, [r2 + 8] ; window back one more sample
10787 pshufb m3, m2
10788
10789 pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
10790 paddd m1, [pd_16]
10791 psrld m1, 5
10792
10793 pmaddwd m0, m3, [r3] ; [16]
10794 paddd m0, [pd_16]
10795 psrld m0, 5
10796 packusdw m1, m0
10797
10798 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
10799
10800 pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
10801 paddd m4, [pd_16]
10802 psrld m4, 5
10803
10804 movu m3, [r2 + 6] ; window back one more sample
10805 pshufb m3, m2
10806
10807 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10808 paddd m1, [pd_16]
10809 psrld m1, 5
10810 packusdw m4, m1
10811
10812 pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
10813 paddd m5, [pd_16]
10814 psrld m5, 5
10815
10816 pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
10817 paddd m0, [pd_16]
10818 psrld m0, 5
10819 packusdw m5, m0
10820
10821 pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
10822 paddd m6, [pd_16]
10823 psrld m6, 5
10824
10825 movu m3, [r2 + 4] ; window back one more sample
10826 pshufb m3, m2
10827
10828 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
10829 paddd m1, [pd_16]
10830 psrld m1, 5
10831 packusdw m6, m1
10832
10833 pmaddwd m1, m3, [r3 + 16] ; [17]
10834 paddd m1, [pd_16]
10835 psrld m1, 5
10836
10837 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
10838 paddd m0, [pd_16]
10839 psrld m0, 5
10840 packusdw m1, m0
10841
10842 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
10843
10844 movu m3, [r2 + 2] ; window back one more sample
10845 pshufb m3, m2
10846
10847 pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
10848 paddd m4, [pd_16]
10849 psrld m4, 5
10850
10851 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
10852 paddd m1, [pd_16]
10853 psrld m1, 5
10854 packusdw m4, m1
10855
10856 pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
10857 paddd m5, [pd_16]
10858 psrld m5, 5
10859
10860 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
10861 paddd m6, [pd_16]
10862 psrld m6, 5
10863 packusdw m5, m6
10864
10865 movu m3, [r2] ; window back one more sample
10866 pshufb m3, m2
10867
10868 pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
10869 paddd m6, [pd_16]
10870 psrld m6, 5
10871
10872 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10873 paddd m1, [pd_16]
10874 psrld m1, 5
10875 packusdw m6, m1
10876
10877 pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
10878 paddd m1, [pd_16]
10879 psrld m1, 5
10880
10881 packusdw m1, m1
10882 movhps m1, [r2] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
10883
10884 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 ; result groups 24-31
10885%endmacro
10886;------------------------------------------------------------------------------------------------------------------
10887; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10888;------------------------------------------------------------------------------------------------------------------
10889INIT_XMM sse4
10890cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2) ; 4 args loaded, 6 GPRs, 7 XMM regs, negative stack-size request of 5*mmsize+2 bytes (see x86inc for the sign convention)
10891 movu m0, [r2 + 0*mmsize] ; read 64 bytes from r2 (refLeft per the prototype above)
10892 movu m1, [r2 + 1*mmsize]
10893 movu m2, [r2 + 2*mmsize]
10894 movu m3, [r2 + 3*mmsize]
10895 movu [rsp + 1*mmsize], m0 ; stack copy of those 64 bytes starts at rsp + 16
10896 movu [rsp + 2*mmsize], m1
10897 movu [rsp + 3*mmsize], m2
10898 movu [rsp + 4*mmsize], m3
10899
10900 mov r4w, [r2+64] ; word that follows the 64 bytes already copied
10901 mov [rsp+80], r4w ; appended right after the copied block
10902 movu m0, [r3 + 8] ; 8 words from r3 (refAbove per the prototype above), byte offsets 8..23
10903 movu m1, [r3 + 36] ; 8 words from r3, byte offsets 36..51
10904 pshufb m0, [shuf_mode_13_23] ; low four words become (filler, word 7, word 3, word 0) of the loaded vector
10905 pshufb m1, [shuf_mode_13_23]
10906 movh [rsp + 8], m0 ; 8 bytes directly in front of the copied block
10907 movh [rsp], m1 ; 8 bytes in front of those
10908 mov r4w, [r3+28] ; the filler word at rsp + 8 is replaced by the word at r3 + 28 (must follow the movh above)
10909 mov [rsp+8], r4w
10910 mov r4w, [r3+56] ; the filler word at rsp + 0 is replaced by the word at r3 + 56 (must follow the movh above)
10911 mov [rsp], r4w
10912
10913 lea r3, [ang_table + 16 * 16] ; r3 now addresses the weight table: [r3 + k * 16] is ang_table row 16 + k
10914 mov r4d, 8 ; number of .loop passes
10915 mov r2, rsp ; MODE_13_23 reads its samples from the stack copy
10916 add r1, r1 ; r1 = 2 * dstStride
10917 lea r5, [r1 * 3] ; r5 = 3 * r1
10918 mova m2, [pw_punpcklwd] ; shuffle mask that MODE_13_23 applies via pshufb m3, m2
10919
10920.loop:
10921 MODE_13_23 1
10922 lea r0, [r0 + r1 * 4 ] ; advance dst by 4 * r1 bytes
10923 add r2, 8 ; advance the source window by 8 bytes
10924 dec r4
10925 jnz .loop
10926 RET
10927
10928%macro MODE_14_22 1 ; builds 32 result groups of 4 samples from the reference at r2; the weight index falls by 13 (mod 32) per group and the source window steps back 2 bytes at each wrap; expects m2 = pair-interleave pshufb mask (NOTE(review): the caller that sets m2 is not in this view); %1 is forwarded to TRANSPOSE_STORE_8x8
10929 movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
10930 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
10931
10932 pmaddwd m4, m3, [r3 + 3 * 16] ; [19] bracketed number = ang_table row if r3 = ang_table + 16 * 16, as in the sibling callers (this macro's caller is cut off in this view)
10933 paddd m4, [pd_16] ; rounding term (pd_16 is defined elsewhere; presumably dwords of 16)
10934 psrld m4, 5 ; divide by 32
10935
10936 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
10937 paddd m1, [pd_16]
10938 psrld m1, 5
10939 packusdw m4, m1 ; m4 = group [19] in the low half, group [06] in the high half, as words
10940
10941 movu m3, [r2 + 22] ; weight index wrapped: window moves back one sample (2 bytes)
10942 pshufb m3, m2
10943
10944 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
10945 paddd m5, [pd_16]
10946 psrld m5, 5
10947
10948 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
10949 paddd m6, [pd_16]
10950 psrld m6, 5
10951 packusdw m5, m6
10952
10953 movu m3, [r2 + 20] ; window back one more sample
10954 pshufb m3, m2
10955
10956 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
10957 paddd m6, [pd_16]
10958 psrld m6, 5
10959
10960 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10961 paddd m1, [pd_16]
10962 psrld m1, 5
10963 packusdw m6, m1
10964
10965 pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
10966 paddd m1, [pd_16]
10967 psrld m1, 5
10968
10969 movu m3, [r2 + 18] ; window back one more sample
10970 pshufb m3, m2
10971
10972 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
10973 paddd m0, [pd_16]
10974 psrld m0, 5
10975 packusdw m1, m0
10976
10977 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; result groups 0-7 (store macro is defined outside this view)
10978
10979 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
10980 paddd m4, [pd_16]
10981 psrld m4, 5
10982
10983 movu m3, [r2 + 16] ; window back one more sample
10984 pshufb m3, m2
10985
10986 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10987 paddd m1, [pd_16]
10988 psrld m1, 5
10989 packusdw m4, m1
10990
10991 pmaddwd m5, m3, [r3 + 16] ; [17]
10992 paddd m5, [pd_16]
10993 psrld m5, 5
10994
10995 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
10996 paddd m6, [pd_16]
10997 psrld m6, 5
10998 packusdw m5, m6
10999
11000 movu m3, [r2 + 14] ; window back one more sample
11001 pshufb m3, m2
11002
11003 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
11004 paddd m6, [pd_16]
11005 psrld m6, 5
11006
11007 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11008 paddd m1, [pd_16]
11009 psrld m1, 5
11010 packusdw m6, m1
11011
11012 movu m3, [r2 + 12] ; window back one more sample
11013 pshufb m3, m2
11014
11015 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
11016 paddd m1, [pd_16]
11017 psrld m1, 5
11018
11019 pmaddwd m0, m3, [r3] ; [16]
11020 paddd m0, [pd_16]
11021 psrld m0, 5
11022 packusdw m1, m0
11023
11024 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 ; result groups 8-15
11025
11026 pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
11027 paddd m4, [pd_16]
11028 psrld m4, 5
11029
11030 movu m3, [r2 + 10] ; window back one more sample
11031 pshufb m3, m2
11032
11033 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
11034 paddd m1, [pd_16]
11035 psrld m1, 5
11036 packusdw m4, m1
11037
11038 pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
11039 paddd m5, [pd_16]
11040 psrld m5, 5
11041
11042 movu m3, [r2 + 8] ; window back one more sample
11043 pshufb m3, m2
11044
11045 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
11046 paddd m0, [pd_16]
11047 psrld m0, 5
11048 packusdw m5, m0
11049
11050 pmaddwd m6, m3, [r3 - 16] ; [15]
11051 paddd m6, [pd_16]
11052 psrld m6, 5
11053
11054 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
11055 paddd m1, [pd_16]
11056 psrld m1, 5
11057 packusdw m6, m1
11058
11059 movu m3, [r2 + 6] ; window back one more sample
11060 pshufb m3, m2
11061
11062 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
11063 paddd m1, [pd_16]
11064 psrld m1, 5
11065
11066 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
11067 paddd m0, [pd_16]
11068 psrld m0, 5
11069 packusdw m1, m0
11070
11071 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 ; result groups 16-23
11072
11073 movu m3, [r2 + 4] ; window back one more sample
11074 pshufb m3, m2
11075
11076 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
11077 paddd m4, [pd_16]
11078 psrld m4, 5
11079
11080 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
11081 paddd m1, [pd_16]
11082 psrld m1, 5
11083 packusdw m4, m1
11084
11085 pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
11086 paddd m5, [pd_16]
11087 psrld m5, 5
11088
11089 movu m3, [r2 + 2] ; window back one more sample
11090 pshufb m3, m2
11091
11092 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
11093 paddd m6, [pd_16]
11094 psrld m6, 5
11095 packusdw m5, m6
11096
11097 pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
11098 paddd m6, [pd_16]
11099 psrld m6, 5
11100
11101 movu m3, [r2] ; window back one more sample
11102 pshufb m3, m2
11103
11104 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11105 paddd m1, [pd_16]
11106 psrld m1, 5
11107 packusdw m6, m1
11108
11109 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
11110 paddd m1, [pd_16]
11111 psrld m1, 5
11112
11113 packusdw m1, m1
11114 movhps m1, [r2] ; [00] weight index 0: 4 reference samples copied unfiltered into the high half
11115
11116 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 ; result groups 24-31
11117%endmacro
11118;------------------------------------------------------------------------------------------------------------------
11119; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11120;------------------------------------------------------------------------------------------------------------------
; Register roles on entry follow the prototype in the banner above
; (x86inc argument order): r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove.  r4/r5 are scratch, m0-m6 are used.
;
; The function first builds one contiguous reference buffer on the stack
; and then runs MODE_14_22 eight times over it:
;   [rsp+24 .. rsp+87]  64 bytes copied from r2
;   [rsp+88]            the word at [r2 + 64]
;   [rsp+22]            the word at [r3 + 4]
;   [rsp-2  .. rsp+21]  words picked from r3 by shuf_mode_14_22
; shuf_mode_14_22 places source words 7, 5, 2, 0 of each 16-byte load into
; the low four words, which is why only the low 8 bytes (movh) are stored.
11121INIT_XMM sse4
11122cglobal intra_pred_ang32_14, 4,6,7,0-(5*mmsize+10)
11123 movu m0, [r2 + 0*mmsize]
11124 movu m1, [r2 + 1*mmsize]
11125 movu m2, [r2 + 2*mmsize]
11126 movu m3, [r2 + 3*mmsize]
11127 movu [rsp + 1*mmsize + 8], m0
11128 movu [rsp + 2*mmsize + 8], m1
11129 movu [rsp + 3*mmsize + 8], m2
11130 movu [rsp + 4*mmsize + 8], m3
11131
; Append the 33rd word of the r2 array right after the 64-byte copy
; (88 = 4*mmsize + 8 + 16 with mmsize = 16).
11132 mov r4w, [r2 + 64]
11133 mov [rsp + 88], r4w
11134 mov r4w, [r3+4]
11135 mov [rsp+22], r4w
11136 movu m0, [r3 + 10]
11137 movu m1, [r3 + 30]
11138 movu m2, [r3 + 50]
11139 pshufb m0, [shuf_mode_14_22]
11140 pshufb m1, [shuf_mode_14_22]
11141 pshufb m2, [shuf_mode_14_22]
11142 movh [rsp + 14], m0
11143 movh [rsp + 6], m1
; NOTE(review): this store begins 2 bytes below rsp.  The loop below starts
; reading at r2 = rsp and the visible part of MODE_14_22 only uses
; non-negative offsets from r2, so those 2 bytes look unused - confirm
; against the start of the macro.
11144 movh [rsp - 2], m2
11145
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r1 is doubled (presumably pixel stride -> byte stride for 16-bit pixels;
; confirm against TRANSPOSE_STORE_8x8), r5 = 3 * r1.
; m2 holds the pshufb control consumed by the macro.
11146 lea r3, [ang_table + 16 * 16]
11147 mov r4d, 8
11148 mov r2, rsp
11149 add r1, r1
11150 lea r5, [r1 * 3]
11151 mova m2, [pw_punpcklwd]
11152
; 8 iterations: each one advances dst by 4 strides and the reference
; pointer by 8 bytes (4 words).
11153.loop:
11154 MODE_14_22 1
11155 lea r0, [r0 + r1 * 4 ]
11156 add r2, 8
11157 dec r4
11158 jnz .loop
11159 RET
11160
; MODE_15_21 <store-arg>
; Emits 32 weighted-interpolation steps in four groups of eight; each group
; ends with TRANSPOSE_STORE_8x8 at byte offset 0, 16, 32 or 48.
; Expected state (set up by the invoking function, not by this macro):
;   r2 = base of the 16-bit reference buffer; windows are loaded from byte
;        offsets 32 down to 0, moving back one word (2 bytes) per new window
;   r3 = ang_table + 16*16, so [r3 + k*16] is the table row for fraction
;        16+k; the bracketed number in each trailing comment is that fraction
;   m2 = pshufb control giving the overlapping-pair layout shown in the
;        comment on the first pshufb
;   %1 = forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this
;        excerpt)
; Each step is pmaddwd with the (32-f, f) weight pairs, + [pd_16], >> 5
; (presumably round-to-nearest; pd_16 is defined elsewhere).  Two steps are
; packed into one register of 16-bit words with packusdw.
; Writes m0, m1, m3, m4, m5, m6; m2, r2 and r3 are only read here.
11161%macro MODE_15_21 1
11162 movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
11163 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
11164
11165 pmaddwd m4, m3, [r3 - 16] ; [15]
11166 paddd m4, [pd_16]
11167 psrld m4, 5
11168
11169 movu m3, [r2 + 30]
11170 pshufb m3, m2
11171
11172 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
11173 paddd m1, [pd_16]
11174 psrld m1, 5
11175 packusdw m4, m1
11176
11177 pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
11178 paddd m5, [pd_16]
11179 psrld m5, 5
11180
11181 movu m3, [r2 + 28]
11182 pshufb m3, m2
11183
11184 pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
11185 paddd m6, [pd_16]
11186 psrld m6, 5
11187 packusdw m5, m6
11188
11189 pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
11190 paddd m6, [pd_16]
11191 psrld m6, 5
11192
11193 movu m3, [r2 + 26]
11194 pshufb m3, m2
11195
11196 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11197 paddd m1, [pd_16]
11198 psrld m1, 5
11199 packusdw m6, m1
11200
11201 pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
11202 paddd m1, [pd_16]
11203 psrld m1, 5
11204
11205 movu m3, [r2 + 24]
11206 pshufb m3, m2
11207
11208 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
11209 paddd m0, [pd_16]
11210 psrld m0, 5
11211 packusdw m1, m0
11212
; Group 1 complete: m4, m5, m6, m1 each hold two packed steps.
11213 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
11214
; m3 still holds the [r2 + 24] window and is reused for the next step.
11215 pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
11216 paddd m4, [pd_16]
11217 psrld m4, 5
11218
11219 movu m3, [r2 + 22]
11220 pshufb m3, m2
11221
11222 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
11223 paddd m1, [pd_16]
11224 psrld m1, 5
11225 packusdw m4, m1
11226
11227 pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
11228 paddd m5, [pd_16]
11229 psrld m5, 5
11230
11231 movu m3, [r2 + 20]
11232 pshufb m3, m2
11233
11234 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
11235 paddd m6, [pd_16]
11236 psrld m6, 5
11237 packusdw m5, m6
11238
11239 pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
11240 paddd m6, [pd_16]
11241 psrld m6, 5
11242
11243 movu m3, [r2 + 18]
11244 pshufb m3, m2
11245
11246 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
11247 paddd m1, [pd_16]
11248 psrld m1, 5
11249 packusdw m6, m1
11250
11251 pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
11252 paddd m1, [pd_16]
11253 psrld m1, 5
11254
11255 movu m3, [r2 + 16]
11256 pshufb m3, m2
11257
11258 pmaddwd m0, m3, [r3] ; [16]
11259 paddd m0, [pd_16]
11260 psrld m0, 5
11261 packusdw m1, m0
11262
; Group 2 complete.
11263 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11264
11265 movu m3, [r2 + 14]
11266 pshufb m3, m2
11267
11268 pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
11269 paddd m4, [pd_16]
11270 psrld m4, 5
11271
11272 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
11273 paddd m1, [pd_16]
11274 psrld m1, 5
11275 packusdw m4, m1
11276
11277 movu m3, [r2 + 12]
11278 pshufb m3, m2
11279
11280 pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
11281 paddd m5, [pd_16]
11282 psrld m5, 5
11283
11284 pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
11285 paddd m0, [pd_16]
11286 psrld m0, 5
11287 packusdw m5, m0
11288
11289 movu m3, [r2 + 10]
11290 pshufb m3, m2
11291
11292 pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
11293 paddd m6, [pd_16]
11294 psrld m6, 5
11295
11296 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11297 paddd m1, [pd_16]
11298 psrld m1, 5
11299 packusdw m6, m1
11300
11301 movu m3, [r2 + 8]
11302 pshufb m3, m2
11303
11304 pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
11305 paddd m1, [pd_16]
11306 psrld m1, 5
11307
11308 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
11309 paddd m0, [pd_16]
11310 psrld m0, 5
11311 packusdw m1, m0
11312
; Group 3 complete.
11313 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11314
11315 movu m3, [r2 + 6]
11316 pshufb m3, m2
11317
11318 pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
11319 paddd m4, [pd_16]
11320 psrld m4, 5
11321
11322 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
11323 paddd m1, [pd_16]
11324 psrld m1, 5
11325 packusdw m4, m1
11326
11327 movu m3, [r2 + 4]
11328 pshufb m3, m2
11329
11330 pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
11331 paddd m5, [pd_16]
11332 psrld m5, 5
11333
11334 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
11335 paddd m6, [pd_16]
11336 psrld m6, 5
11337 packusdw m5, m6
11338
11339 movu m3, [r2 + 2]
11340 pshufb m3, m2
11341
11342 pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
11343 paddd m6, [pd_16]
11344 psrld m6, 5
11345
11346 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
11347 paddd m1, [pd_16]
11348 psrld m1, 5
11349 packusdw m6, m1
11350
11351 movu m3, [r2]
11352 pshufb m3, m2
11353
11354 pmaddwd m1, m3, [r3 + 16] ; [17]
11355 paddd m1, [pd_16]
11356 psrld m1, 5
11357
; Last step has fraction 0: no weighting is applied, the 8 bytes at [r2]
; are loaded directly into the high half of m1 next to the packed [17] step.
11358 packusdw m1, m1
11359 movhps m1, [r2] ; [00]
11360
11361 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11362%endmacro
11363;------------------------------------------------------------------------------------------------------------------
11364; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11365;------------------------------------------------------------------------------------------------------------------
; Register roles on entry follow the prototype in the banner above
; (x86inc argument order): r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove.  r4/r5 are scratch, m0-m6 are used.
;
; Stack reference buffer built before the loop:
;   [rsp+32 .. rsp+95]  64 bytes copied from r2
;   [rsp+96]            the word at [r2 + 64]
;   [rsp    .. rsp+31]  words picked from r3 by shuf_mode_15_21
; shuf_mode_15_21 places source words 6, 4, 2, 0 of each 16-byte load into
; the low four words; only those 8 bytes are stored (movh).
11366INIT_XMM sse4
11367cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2)
11368 movu m0, [r2 + 0*mmsize]
11369 movu m1, [r2 + 1*mmsize]
11370 movu m2, [r2 + 2*mmsize]
11371 movu m3, [r2 + 3*mmsize]
11372 movu [rsp + 2*mmsize], m0
11373 movu [rsp + 3*mmsize], m1
11374 movu [rsp + 4*mmsize], m2
11375 movu [rsp + 5*mmsize], m3
11376
; 96 = 2*mmsize + 64 with mmsize = 16: the word lands right after the copy.
11377 mov r4w, [r2 + 64]
11378 mov [rsp + 96], r4w
11379 movu m0, [r3 + 4]
11380 movu m1, [r3 + 18]
11381 movu m2, [r3 + 34]
11382 movu m3, [r3 + 48]
11383 pshufb m0, [shuf_mode_15_21]
11384 pshufb m1, [shuf_mode_15_21]
11385 pshufb m2, [shuf_mode_15_21]
11386 pshufb m3, [shuf_mode_15_21]
; Four 8-byte stores fill [rsp .. rsp+31] back to back, highest slot first.
11387 movh [rsp + 24], m0
11388 movh [rsp + 16], m1
11389 movh [rsp + 8], m2
11390 movh [rsp], m3
11391
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r1 is doubled (presumably pixel stride -> byte stride for 16-bit pixels;
; confirm against TRANSPOSE_STORE_8x8), r5 = 3 * r1.
; m2 holds the pshufb control consumed by MODE_15_21.
11392 lea r3, [ang_table + 16 * 16]
11393 mov r4d, 8
11394 mov r2, rsp
11395 add r1, r1
11396 lea r5, [r1 * 3]
11397 mova m2, [pw_punpcklwd]
11398
; 8 iterations: each one advances dst by 4 strides and the reference
; pointer by 8 bytes (4 words).
11399.loop:
11400 MODE_15_21 1
11401 lea r0, [r0 + r1 * 4 ]
11402 add r2, 8
11403 dec r4
11404 jnz .loop
11405 RET
11406
; MODE_16_20 <store-arg>
; Emits 32 weighted-interpolation steps in four groups of eight; each group
; ends with TRANSPOSE_STORE_8x8 at byte offset 0, 16, 32 or 48.
; Expected state (set up by the invoking function, not by this macro):
;   r2 = base of the 16-bit reference buffer; windows are loaded from byte
;        offsets 40 down to 0, moving back one word (2 bytes) per new window
;   r3 = ang_table + 16*16, so [r3 + k*16] is the table row for fraction
;        16+k; the bracketed number in each trailing comment is that fraction
;   m2 = pshufb control giving the overlapping-pair layout shown in the
;        comment on the first pshufb
;   %1 = forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this
;        excerpt)
; Each step is pmaddwd with the (32-f, f) weight pairs, + [pd_16], >> 5
; (presumably round-to-nearest; pd_16 is defined elsewhere).  Two steps are
; packed into one register of 16-bit words with packusdw.  A window is used
; for one or two steps before the next one is loaded.
; Writes m0, m1, m3, m4, m5, m6; m2, r2 and r3 are only read here.
11407%macro MODE_16_20 1
11408 movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
11409 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
11410
11411 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
11412 paddd m4, [pd_16]
11413 psrld m4, 5
11414
11415 movu m3, [r2 + 38]
11416 pshufb m3, m2
11417
11418 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
11419 paddd m1, [pd_16]
11420 psrld m1, 5
11421 packusdw m4, m1
11422
11423 pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
11424 paddd m5, [pd_16]
11425 psrld m5, 5
11426
11427 movu m3, [r2 + 36]
11428 pshufb m3, m2
11429
11430 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
11431 paddd m6, [pd_16]
11432 psrld m6, 5
11433 packusdw m5, m6
11434
11435 movu m3, [r2 + 34]
11436 pshufb m3, m2
11437
11438 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
11439 paddd m6, [pd_16]
11440 psrld m6, 5
11441
11442 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
11443 paddd m1, [pd_16]
11444 psrld m1, 5
11445 packusdw m6, m1
11446
11447 movu m3, [r2 + 32]
11448 pshufb m3, m2
11449
11450 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
11451 paddd m1, [pd_16]
11452 psrld m1, 5
11453
11454 movu m3, [r2 + 30]
11455 pshufb m3, m2
11456
11457 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
11458 paddd m0, [pd_16]
11459 psrld m0, 5
11460 packusdw m1, m0
11461
; Group 1 complete: m4, m5, m6, m1 each hold two packed steps.
11462 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
11463
; m3 still holds the [r2 + 30] window and is reused for the next step.
11464 pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
11465 paddd m4, [pd_16]
11466 psrld m4, 5
11467
11468 movu m3, [r2 + 28]
11469 pshufb m3, m2
11470
11471 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
11472 paddd m1, [pd_16]
11473 psrld m1, 5
11474 packusdw m4, m1
11475
11476 movu m3, [r2 + 26]
11477 pshufb m3, m2
11478
11479 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
11480 paddd m5, [pd_16]
11481 psrld m5, 5
11482
11483 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
11484 paddd m6, [pd_16]
11485 psrld m6, 5
11486 packusdw m5, m6
11487
11488 movu m3, [r2 + 24]
11489 pshufb m3, m2
11490
11491 pmaddwd m6, m3, [r3 - 16] ; [15]
11492 paddd m6, [pd_16]
11493 psrld m6, 5
11494
11495 movu m3, [r2 + 22]
11496 pshufb m3, m2
11497
11498 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11499 paddd m1, [pd_16]
11500 psrld m1, 5
11501 packusdw m6, m1
11502
11503 pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
11504 paddd m1, [pd_16]
11505 psrld m1, 5
11506
11507 movu m3, [r2 + 20]
11508 pshufb m3, m2
11509
11510 pmaddwd m0, m3, [r3] ; [16]
11511 paddd m0, [pd_16]
11512 psrld m0, 5
11513 packusdw m1, m0
11514
; Group 2 complete.
11515 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11516
11517 movu m3, [r2 + 18]
11518 pshufb m3, m2
11519
11520 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
11521 paddd m4, [pd_16]
11522 psrld m4, 5
11523
11524 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
11525 paddd m1, [pd_16]
11526 psrld m1, 5
11527 packusdw m4, m1
11528
11529 movu m3, [r2 + 16]
11530 pshufb m3, m2
11531
11532 pmaddwd m5, m3, [r3 + 16] ; [17]
11533 paddd m5, [pd_16]
11534 psrld m5, 5
11535
11536 movu m3, [r2 + 14]
11537 pshufb m3, m2
11538
11539 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
11540 paddd m0, [pd_16]
11541 psrld m0, 5
11542 packusdw m5, m0
11543
11544 pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
11545 paddd m6, [pd_16]
11546 psrld m6, 5
11547
11548 movu m3, [r2 + 12]
11549 pshufb m3, m2
11550
11551 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
11552 paddd m1, [pd_16]
11553 psrld m1, 5
11554 packusdw m6, m1
11555
11556 movu m3, [r2 + 10]
11557 pshufb m3, m2
11558
11559 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
11560 paddd m1, [pd_16]
11561 psrld m1, 5
11562
11563 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
11564 paddd m0, [pd_16]
11565 psrld m0, 5
11566 packusdw m1, m0
11567
; Group 3 complete.
11568 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11569
11570 movu m3, [r2 + 8]
11571 pshufb m3, m2
11572
11573 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
11574 paddd m4, [pd_16]
11575 psrld m4, 5
11576
11577 movu m3, [r2 + 6]
11578 pshufb m3, m2
11579
11580 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
11581 paddd m1, [pd_16]
11582 psrld m1, 5
11583 packusdw m4, m1
11584
11585 pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
11586 paddd m5, [pd_16]
11587 psrld m5, 5
11588
11589 movu m3, [r2 + 4]
11590 pshufb m3, m2
11591
11592 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
11593 paddd m6, [pd_16]
11594 psrld m6, 5
11595 packusdw m5, m6
11596
11597 movu m3, [r2 + 2]
11598 pshufb m3, m2
11599
11600 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
11601 paddd m6, [pd_16]
11602 psrld m6, 5
11603
11604 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11605 paddd m1, [pd_16]
11606 psrld m1, 5
11607 packusdw m6, m1
11608
11609 movu m3, [r2]
11610 pshufb m3, m2
11611
11612 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
11613 paddd m1, [pd_16]
11614 psrld m1, 5
11615
; Last step has fraction 0: no weighting is applied, the 8 bytes at [r2]
; are loaded directly into the high half of m1 next to the packed [21] step.
11616 packusdw m1, m1
11617 movhps m1, [r2] ; [00]
11618
11619 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11620%endmacro
11621;------------------------------------------------------------------------------------------------------------------
11622; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11623;------------------------------------------------------------------------------------------------------------------
; Register roles on entry follow the prototype in the banner above
; (x86inc argument order): r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove.  r4/r5 are scratch, m0-m6 are used.
;
; Stack reference buffer built before the loop:
;   [rsp+40 .. rsp+103]  64 bytes copied from r2
;   [rsp+104]            the word at [r2 + 64]
;   [rsp    .. rsp+39]   words picked from r3 by shuf_mode_16_20
; shuf_mode_16_20 reorders each load to source words 1, 0, 7, 6, 4, 3, 1, 0.
11624INIT_XMM sse4
11625cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10)
11626 movu m0, [r2 + 0*mmsize]
11627 movu m1, [r2 + 1*mmsize]
11628 movu m2, [r2 + 2*mmsize]
11629 movu m3, [r2 + 3*mmsize]
11630 movu [rsp + 2*mmsize + 8], m0
11631 movu [rsp + 3*mmsize + 8], m1
11632 movu [rsp + 4*mmsize + 8], m2
11633 movu [rsp + 5*mmsize + 8], m3
11634
; 104 = 2*mmsize + 8 + 64 with mmsize = 16: the word lands right after
; the copy.
11635 mov r4w, [r2 + 64]
11636 mov [rsp + 104], r4w
11637 movu m0, [r3 + 4]
11638 movu m1, [r3 + 22]
11639 movu m2, [r3 + 40]
11640 movd m3, [r3 + 58]
11641 pshufb m0, [shuf_mode_16_20]
11642 pshufb m1, [shuf_mode_16_20]
11643 pshufb m2, [shuf_mode_16_20]
11644 pshufb m3, [shuf_mode_16_20]
; The stores below overlap on purpose and their order matters: each later
; store overwrites the first 4 bytes written by the one before it
; (rsp+24..27, then rsp+12..15, then rsp..rsp+3).
11645 movu [rsp + 24], m0
11646 movu [rsp + 12], m1
11647 movu [rsp], m2
11648 movd [rsp], m3
11649
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r1 is doubled (presumably pixel stride -> byte stride for 16-bit pixels;
; confirm against TRANSPOSE_STORE_8x8), r5 = 3 * r1.
; m2 holds the pshufb control consumed by MODE_16_20.
11650 lea r3, [ang_table + 16 * 16]
11651 mov r4d, 8
11652 mov r2, rsp
11653 add r1, r1
11654 lea r5, [r1 * 3]
11655 mova m2, [pw_punpcklwd]
11656
; 8 iterations: each one advances dst by 4 strides and the reference
; pointer by 8 bytes (4 words).
11657.loop:
11658 MODE_16_20 1
11659 lea r0, [r0 + r1 * 4 ]
11660 add r2, 8
11661 dec r4
11662 jnz .loop
11663 RET
11664
; MODE_17_19 <store-arg>
; Emits 32 steps in four groups of eight; each group ends with
; TRANSPOSE_STORE_8x8 at byte offset 0, 16, 32 or 48.  The fraction sequence
; 06,12,18,24,30,04,10,16,22,28,02,08,14,20,26,00 is run twice, so two of
; the 32 steps (the last of groups 2 and 4) are plain copies.
; Expected state (set up by the invoking function, not by this macro):
;   r2 = base of the 16-bit reference buffer; windows are loaded from byte
;        offsets 50 down to 0, moving back one word (2 bytes) per new window
;   r3 = ang_table + 16*16, so [r3 + k*16] is the table row for fraction
;        16+k; the bracketed number in each trailing comment is that fraction
;   m2 = pshufb control giving the overlapping-pair layout shown in the
;        comment on the first pshufb
;   %1 = forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this
;        excerpt)
; Each weighted step is pmaddwd with the (32-f, f) weight pairs, + [pd_16],
; >> 5 (presumably round-to-nearest; pd_16 is defined elsewhere).  Two steps
; are packed into one register of 16-bit words with packusdw.
; Writes m0, m1, m3, m4, m5, m6; m2, r2 and r3 are only read here.
11665%macro MODE_17_19 1
11666 movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0]
11667 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
11668
11669 pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
11670 paddd m4, [pd_16]
11671 psrld m4, 5
11672
11673 movu m3, [r2 + 48]
11674 pshufb m3, m2
11675
11676 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
11677 paddd m1, [pd_16]
11678 psrld m1, 5
11679 packusdw m4, m1
11680
11681 movu m3, [r2 + 46]
11682 pshufb m3, m2
11683
11684 pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
11685 paddd m5, [pd_16]
11686 psrld m5, 5
11687
11688 movu m3, [r2 + 44]
11689 pshufb m3, m2
11690
11691 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
11692 paddd m6, [pd_16]
11693 psrld m6, 5
11694 packusdw m5, m6
11695
11696 movu m3, [r2 + 42]
11697 pshufb m3, m2
11698
11699 pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
11700 paddd m6, [pd_16]
11701 psrld m6, 5
11702
11703 pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
11704 paddd m1, [pd_16]
11705 psrld m1, 5
11706 packusdw m6, m1
11707
11708 movu m3, [r2 + 40]
11709 pshufb m3, m2
11710
11711 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11712 paddd m1, [pd_16]
11713 psrld m1, 5
11714
11715 movu m3, [r2 + 38]
11716 pshufb m3, m2
11717
11718 pmaddwd m0, m3, [r3] ; [16]
11719 paddd m0, [pd_16]
11720 psrld m0, 5
11721 packusdw m1, m0
11722
; Group 1 complete: m4, m5, m6, m1 each hold two packed steps.
11723 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
11724
11725 movu m3, [r2 + 36]
11726 pshufb m3, m2
11727
11728 pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
11729 paddd m4, [pd_16]
11730 psrld m4, 5
11731
11732 movu m3, [r2 + 34]
11733 pshufb m3, m2
11734
11735 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
11736 paddd m1, [pd_16]
11737 psrld m1, 5
11738 packusdw m4, m1
11739
11740 pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
11741 paddd m5, [pd_16]
11742 psrld m5, 5
11743
11744 movu m3, [r2 + 32]
11745 pshufb m3, m2
11746
11747 pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
11748 paddd m6, [pd_16]
11749 psrld m6, 5
11750 packusdw m5, m6
11751
11752 movu m3, [r2 + 30]
11753 pshufb m3, m2
11754
11755 pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
11756 paddd m6, [pd_16]
11757 psrld m6, 5
11758
11759 movu m3, [r2 + 28]
11760 pshufb m3, m2
11761
11762 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
11763 paddd m1, [pd_16]
11764 psrld m1, 5
11765 packusdw m6, m1
11766
11767 movu m3, [r2 + 26]
11768 pshufb m3, m2
11769
11770 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11771 paddd m1, [pd_16]
11772 psrld m1, 5
11773
; Fraction 0 (first occurrence): the 8 bytes at [r2 + 26] are loaded
; unweighted into the high half of m1 next to the packed [26] step.
11774 packusdw m1, m1
11775 movhps m1, [r2 + 26] ; [00]
11776
; Group 2 complete.
11777 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11778
; Second pass through the same fraction sequence, 26 bytes further back.
11779 movu m3, [r2 + 24]
11780 pshufb m3, m2
11781
11782 pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
11783 paddd m4, [pd_16]
11784 psrld m4, 5
11785
11786 movu m3, [r2 + 22]
11787 pshufb m3, m2
11788
11789 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
11790 paddd m1, [pd_16]
11791 psrld m1, 5
11792 packusdw m4, m1
11793
11794 movu m3, [r2 + 20]
11795 pshufb m3, m2
11796
11797 pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
11798 paddd m5, [pd_16]
11799 psrld m5, 5
11800
11801 movu m3, [r2 + 18]
11802 pshufb m3, m2
11803
11804 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
11805 paddd m0, [pd_16]
11806 psrld m0, 5
11807 packusdw m5, m0
11808
11809 movu m3, [r2 + 16]
11810 pshufb m3, m2
11811
11812 pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
11813 paddd m6, [pd_16]
11814 psrld m6, 5
11815
11816 pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
11817 paddd m1, [pd_16]
11818 psrld m1, 5
11819 packusdw m6, m1
11820
11821 movu m3, [r2 + 14]
11822 pshufb m3, m2
11823
11824 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11825 paddd m1, [pd_16]
11826 psrld m1, 5
11827
11828 movu m3, [r2 + 12]
11829 pshufb m3, m2
11830
11831 pmaddwd m0, m3, [r3] ; [16]
11832 paddd m0, [pd_16]
11833 psrld m0, 5
11834 packusdw m1, m0
11835
; Group 3 complete.
11836 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11837
11838 movu m3, [r2 + 10]
11839 pshufb m3, m2
11840
11841 pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
11842 paddd m4, [pd_16]
11843 psrld m4, 5
11844
11845 movu m3, [r2 + 8]
11846 pshufb m3, m2
11847
11848 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
11849 paddd m1, [pd_16]
11850 psrld m1, 5
11851 packusdw m4, m1
11852
11853 pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
11854 paddd m5, [pd_16]
11855 psrld m5, 5
11856
11857 movu m3, [r2 + 6]
11858 pshufb m3, m2
11859
11860 pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
11861 paddd m6, [pd_16]
11862 psrld m6, 5
11863 packusdw m5, m6
11864
11865 movu m3, [r2 + 4]
11866 pshufb m3, m2
11867
11868 pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
11869 paddd m6, [pd_16]
11870 psrld m6, 5
11871
11872 movu m3, [r2 + 2]
11873 pshufb m3, m2
11874
11875 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
11876 paddd m1, [pd_16]
11877 psrld m1, 5
11878 packusdw m6, m1
11879
11880 movu m3, [r2]
11881 pshufb m3, m2
11882
11883 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11884 paddd m1, [pd_16]
11885 psrld m1, 5
11886
; Fraction 0 (second occurrence): the 8 bytes at [r2] are loaded unweighted
; into the high half of m1.
11887 packusdw m1, m1
11888 movhps m1, [r2] ; [00]
11889
11890 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11891%endmacro
11892;------------------------------------------------------------------------------------------------------------------
11893; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11894;------------------------------------------------------------------------------------------------------------------
; Register roles on entry follow the prototype in the banner above
; (x86inc argument order): r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove.  r4/r5 are scratch, m0-m6 are used.
;
; Stack reference buffer built before the loop:
;   [rsp+50 .. rsp+113]  64 bytes copied from r2
;   [rsp+114]            the word at [r2 + 64]
;   [rsp-4  .. rsp+49]   words picked from r3 (shuffled loads plus three
;                        single-word patches)
; shuf_mode_17_19 reorders a load to source words 0, 7, 6, 5, 3, 2, 1, 0;
; shuf_mode_16_20 (used for the 4-byte load) swaps its two words.
11895INIT_XMM sse4
11896cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4)
11897 movu m0, [r2 + 0*mmsize]
11898 movu m1, [r2 + 1*mmsize]
11899 movu m2, [r2 + 2*mmsize]
11900 movu m3, [r2 + 3*mmsize]
11901 movu [rsp + 3*mmsize + 2], m0
11902 movu [rsp + 4*mmsize + 2], m1
11903 movu [rsp + 5*mmsize + 2], m2
11904 movu [rsp + 6*mmsize + 2], m3
11905
; 114 = 3*mmsize + 2 + 64 with mmsize = 16: the word lands right after
; the copy.
11906 mov r4w, [r2 + 64]
11907 mov [rsp + 114], r4w
11908 movu m0, [r3 + 8]
11909 movu m1, [r3 + 30]
11910 movu m2, [r3 + 50]
11911 movd m3, [r3 + 2]
11912 pshufb m0, [shuf_mode_17_19]
11913 pshufb m1, [shuf_mode_17_19]
11914 pshufb m2, [shuf_mode_17_19]
11915 pshufb m3, [shuf_mode_16_20]
11916 movd [rsp + 46], m3
11917 movu [rsp + 30], m0
11918 movu [rsp + 12], m1
; NOTE(review): this 16-byte store begins 4 bytes below rsp.  The loop
; starts reading at r2 = rsp and MODE_17_19 only uses non-negative offsets
; from r2, so those 4 bytes look unused.
11919 movu [rsp - 4], m2
; Patch individual words after the vector stores; these must stay after
; them because they overwrite (rsp+30, rsp+12) or fill the gap between
; (rsp+28) the regions just written.
11920 mov r4w, [r3 + 24]
11921 mov [rsp + 30], r4w
11922 mov r4w, [r3 + 28]
11923 mov [rsp + 28], r4w
11924 mov r4w, [r3 + 46]
11925 mov [rsp + 12], r4w
11926
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r1 is doubled (presumably pixel stride -> byte stride for 16-bit pixels;
; confirm against TRANSPOSE_STORE_8x8), r5 = 3 * r1.
; m2 holds the pshufb control consumed by MODE_17_19.
11927 lea r3, [ang_table + 16 * 16]
11928 mov r4d, 8
11929 mov r2, rsp
11930 add r1, r1
11931 lea r5, [r1 * 3]
11932 mova m2, [pw_punpcklwd]
11933
; 8 iterations: each one advances dst by 4 strides and the reference
; pointer by 8 bytes (4 words).
11934.loop:
11935 MODE_17_19 1
11936 lea r0, [r0 + r1 * 4 ]
11937 add r2, 8
11938 dec r4
11939 jnz .loop
11940 RET
11941
11942;-------------------------------------------------------------------------------------------------------------------
11943; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11944;-------------------------------------------------------------------------------------------------------------------
; Register roles on entry follow the prototype in the banner above
; (x86inc argument order): r0 = dst, r1 = dstStride, r2 = refLeft,
; r3 = refAbove.
;
; No weighting is done in this mode.  Every output row is a 64-byte window
; into one long word sequence made of
;     reversed words from [r2 + 2 ...]  followed by  the 64 bytes at [r3].
; Row 0 is the [r3] data itself; each following row slides the window back
; by one word (palignr immediates 14, 12, ..., 2), and after eight rows the
; window base moves back by one whole register.  32 rows are written in
; eight groups of four, with r0 advanced by 4 strides between groups.
;
; NOTE(review): cglobal requests 8 XMM registers but only m0-m6 are
; referenced in this function.
11945INIT_XMM sse4
11946cglobal intra_pred_ang32_18, 4,7,8
11947 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
11948 movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8]
11949 movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16]
11950 movu m3, [r3 + 48] ; [31 30 29 28 27 26 25 24]
11951 movu m4, [r2 + 2] ; [8 7 6 5 4 3 2 1]
11952 movu m5, [r2 + 18] ; [16 15 14 13 12 11 10 9]
11953
; Stride helpers.  r1 is doubled first (presumably pixel stride -> byte
; stride for 16-bit pixels); r6 = 2*r1, r3 = 3*r1 (r3's loads are already
; done), r4 = 4*r1.
11954 add r1, r1
11955 lea r6, [r1 * 2]
11956 lea r3, [r1 * 3]
11957 lea r4, [r1 * 4]
11958
; Row 0: straight copy of the 64 bytes loaded from the original r3.
11959 movu [r0], m0
11960 movu [r0 + 16], m1
11961 movu [r0 + 32], m2
11962 movu [r0 + 48], m3
11963
; Reverse the word order of the r2 samples so they can be shifted in from
; the low end with palignr.
11964 pshufb m4, [shuf_mode32_18] ; [1 2 3 4 5 6 7 8]
11965 pshufb m5, [shuf_mode32_18] ; [9 10 11 12 13 14 15 16]
11966
; Rows 1-7: window slides over m4 | m0 m1 m2 m3.
11967 palignr m6, m0, m4, 14
11968 movu [r0 + r1], m6
11969 palignr m6, m1, m0, 14
11970 movu [r0 + r1 + 16], m6
11971 palignr m6, m2, m1, 14
11972 movu [r0 + r1 + 32], m6
11973 palignr m6, m3, m2, 14
11974 movu [r0 + r1 + 48], m6
11975
11976 palignr m6, m0, m4, 12
11977 movu [r0 + r6], m6
11978 palignr m6, m1, m0, 12
11979 movu [r0 + r6 + 16], m6
11980 palignr m6, m2, m1, 12
11981 movu [r0 + r6 + 32], m6
11982 palignr m6, m3, m2, 12
11983 movu [r0 + r6 + 48], m6
11984
11985 palignr m6, m0, m4, 10
11986 movu [r0 + r3], m6
11987 palignr m6, m1, m0, 10
11988 movu [r0 + r3 + 16], m6
11989 palignr m6, m2, m1, 10
11990 movu [r0 + r3 + 32], m6
11991 palignr m6, m3, m2, 10
11992 movu [r0 + r3 + 48], m6
11993
11994 add r0, r4
11995
11996 palignr m6, m0, m4, 8
11997 movu [r0], m6
11998 palignr m6, m1, m0, 8
11999 movu [r0 + 16], m6
12000 palignr m6, m2, m1, 8
12001 movu [r0 + 32], m6
12002 palignr m6, m3, m2, 8
12003 movu [r0 + 48], m6
12004
12005 palignr m6, m0, m4, 6
12006 movu [r0 + r1], m6
12007 palignr m6, m1, m0, 6
12008 movu [r0 + r1 + 16], m6
12009 palignr m6, m2, m1, 6
12010 movu [r0 + r1 + 32], m6
12011 palignr m6, m3, m2, 6
12012 movu [r0 + r1 + 48], m6
12013
12014 palignr m6, m0, m4, 4
12015 movu [r0 + r6], m6
12016 palignr m6, m1, m0, 4
12017 movu [r0 + r6 + 16], m6
12018 palignr m6, m2, m1, 4
12019 movu [r0 + r6 + 32], m6
12020 palignr m6, m3, m2, 4
12021 movu [r0 + r6 + 48], m6
12022
12023 palignr m6, m0, m4, 2
12024 movu [r0 + r3], m6
12025 palignr m6, m1, m0, 2
12026 movu [r0 + r3 + 16], m6
12027 palignr m6, m2, m1, 2
12028 movu [r0 + r3 + 32], m6
12029 palignr m6, m3, m2, 2
12030 movu [r0 + r3 + 48], m6
12031
12032 add r0, r4
12033
; Row 8: window is exactly m4 m0 m1 m2; m3 is no longer needed from here.
12034 movu [r0], m4
12035 movu [r0 + 16], m0
12036 movu [r0 + 32], m1
12037 movu [r0 + 48], m2
12038
; Rows 9-15: window slides over m5 | m4 m0 m1 m2.
12039 palignr m6, m4, m5, 14
12040 movu [r0 + r1], m6
12041 palignr m6, m0, m4, 14
12042 movu [r0 + r1 + 16], m6
12043 palignr m6, m1, m0, 14
12044 movu [r0 + r1 + 32], m6
12045 palignr m6, m2, m1, 14
12046 movu [r0 + r1 + 48], m6
12047
12048 palignr m6, m4, m5, 12
12049 movu [r0 + r6], m6
12050 palignr m6, m0, m4, 12
12051 movu [r0 + r6 + 16], m6
12052 palignr m6, m1, m0, 12
12053 movu [r0 + r6 + 32], m6
12054 palignr m6, m2, m1, 12
12055 movu [r0 + r6 + 48], m6
12056
12057 palignr m6, m4, m5, 10
12058 movu [r0 + r3], m6
12059 palignr m6, m0, m4, 10
12060 movu [r0 + r3 + 16], m6
12061 palignr m6, m1, m0, 10
12062 movu [r0 + r3 + 32], m6
12063 palignr m6, m2, m1, 10
12064 movu [r0 + r3 + 48], m6
12065
12066 add r0, r4
12067
12068 palignr m6, m4, m5, 8
12069 movu [r0], m6
12070 palignr m6, m0, m4, 8
12071 movu [r0 + 16], m6
12072 palignr m6, m1, m0, 8
12073 movu [r0 + 32], m6
12074 palignr m6, m2, m1, 8
12075 movu [r0 + 48], m6
12076
12077 palignr m6, m4, m5, 6
12078 movu [r0 + r1], m6
12079 palignr m6, m0, m4, 6
12080 movu [r0 + r1 + 16], m6
12081 palignr m6, m1, m0, 6
12082 movu [r0 + r1 + 32], m6
12083 palignr m6, m2, m1, 6
12084 movu [r0 + r1 + 48], m6
12085
12086 palignr m6, m4, m5, 4
12087 movu [r0 + r6], m6
12088 palignr m6, m0, m4, 4
12089 movu [r0 + r6 + 16], m6
12090 palignr m6, m1, m0, 4
12091 movu [r0 + r6 + 32], m6
12092 palignr m6, m2, m1, 4
12093 movu [r0 + r6 + 48], m6
12094
12095 palignr m6, m4, m5, 2
12096 movu [r0 + r3], m6
12097 palignr m6, m0, m4, 2
12098 movu [r0 + r3 + 16], m6
12099 palignr m6, m1, m0, 2
12100 movu [r0 + r3 + 32], m6
12101 palignr m6, m2, m1, 2
12102 movu [r0 + r3 + 48], m6
12103
12104 add r0, r4
12105
; m2 and m3 are free now; reload them with the next 32 bytes of r2 samples,
; word-reversed like m4/m5.
12106 movu m2, [r2 + 34]
12107 movu m3, [r2 + 50]
12108 pshufb m2, [shuf_mode32_18]
12109 pshufb m3, [shuf_mode32_18]
12110
; Row 16: window is exactly m5 m4 m0 m1.
12111 movu [r0], m5
12112 movu [r0 + 16], m4
12113 movu [r0 + 32], m0
12114 movu [r0 + 48], m1
12115
; Rows 17-23: window slides over m2 | m5 m4 m0 m1.
12116 palignr m6, m5, m2, 14
12117 movu [r0 + r1], m6
12118 palignr m6, m4, m5, 14
12119 movu [r0 + r1 + 16], m6
12120 palignr m6, m0, m4, 14
12121 movu [r0 + r1 + 32], m6
12122 palignr m6, m1, m0, 14
12123 movu [r0 + r1 + 48], m6
12124
12125 palignr m6, m5, m2, 12
12126 movu [r0 + r6], m6
12127 palignr m6, m4, m5, 12
12128 movu [r0 + r6 + 16], m6
12129 palignr m6, m0, m4, 12
12130 movu [r0 + r6 + 32], m6
12131 palignr m6, m1, m0, 12
12132 movu [r0 + r6 + 48], m6
12133
12134 palignr m6, m5, m2, 10
12135 movu [r0 + r3], m6
12136 palignr m6, m4, m5, 10
12137 movu [r0 + r3 + 16], m6
12138 palignr m6, m0, m4, 10
12139 movu [r0 + r3 + 32], m6
12140 palignr m6, m1, m0, 10
12141 movu [r0 + r3 + 48], m6
12142
12143 add r0, r4
12144
12145 palignr m6, m5, m2, 8
12146 movu [r0], m6
12147 palignr m6, m4, m5, 8
12148 movu [r0 + 16], m6
12149 palignr m6, m0, m4, 8
12150 movu [r0 + 32], m6
12151 palignr m6, m1, m0, 8
12152 movu [r0 + 48], m6
12153
12154 palignr m6, m5, m2, 6
12155 movu [r0 + r1], m6
12156 palignr m6, m4, m5, 6
12157 movu [r0 + r1 + 16], m6
12158 palignr m6, m0, m4, 6
12159 movu [r0 + r1 + 32], m6
12160 palignr m6, m1, m0, 6
12161 movu [r0 + r1 + 48], m6
12162
12163 palignr m6, m5, m2, 4
12164 movu [r0 + r6], m6
12165 palignr m6, m4, m5, 4
12166 movu [r0 + r6 + 16], m6
12167 palignr m6, m0, m4, 4
12168 movu [r0 + r6 + 32], m6
12169 palignr m6, m1, m0, 4
12170 movu [r0 + r6 + 48], m6
12171
12172 palignr m6, m5, m2, 2
12173 movu [r0 + r3], m6
12174 palignr m6, m4, m5, 2
12175 movu [r0 + r3 + 16], m6
12176 palignr m6, m0, m4, 2
12177 movu [r0 + r3 + 32], m6
12178 palignr m6, m1, m0, 2
12179 movu [r0 + r3 + 48], m6
12180
12181 add r0, r4
12182
; Row 24: window is exactly m2 m5 m4 m0.
12183 movu [r0], m2
12184 movu [r0 + 16], m5
12185 movu [r0 + 32], m4
12186 movu [r0 + 48], m0
12187
; Rows 25-31: window slides over m3 | m2 m5 m4 m0.
12188 palignr m6, m2, m3, 14
12189 movu [r0 + r1], m6
12190 palignr m6, m5, m2, 14
12191 movu [r0 + r1 + 16], m6
12192 palignr m6, m4, m5, 14
12193 movu [r0 + r1 + 32], m6
12194 palignr m6, m0, m4, 14
12195 movu [r0 + r1 + 48], m6
12196
12197 palignr m6, m2, m3, 12
12198 movu [r0 + r6], m6
12199 palignr m6, m5, m2, 12
12200 movu [r0 + r6 + 16], m6
12201 palignr m6, m4, m5, 12
12202 movu [r0 + r6 + 32], m6
12203 palignr m6, m0, m4, 12
12204 movu [r0 + r6 + 48], m6
12205
12206 palignr m6, m2, m3, 10
12207 movu [r0 + r3], m6
12208 palignr m6, m5, m2, 10
12209 movu [r0 + r3 + 16], m6
12210 palignr m6, m4, m5, 10
12211 movu [r0 + r3 + 32], m6
12212 palignr m6, m0, m4, 10
12213 movu [r0 + r3 + 48], m6
12214
12215 add r0, r4
12216
12217 palignr m6, m2, m3, 8
12218 movu [r0], m6
12219 palignr m6, m5, m2, 8
12220 movu [r0 + 16], m6
12221 palignr m6, m4, m5, 8
12222 movu [r0 + 32], m6
12223 palignr m6, m0, m4, 8
12224 movu [r0 + 48], m6
12225
12226 palignr m6, m2, m3, 6
12227 movu [r0 + r1], m6
12228 palignr m6, m5, m2, 6
12229 movu [r0 + r1 + 16], m6
12230 palignr m6, m4, m5, 6
12231 movu [r0 + r1 + 32], m6
12232 palignr m6, m0, m4, 6
12233 movu [r0 + r1 + 48], m6
12234
12235 palignr m6, m2, m3, 4
12236 movu [r0 + r6], m6
12237 palignr m6, m5, m2, 4
12238 movu [r0 + r6 + 16], m6
12239 palignr m6, m4, m5, 4
12240 movu [r0 + r6 + 32], m6
12241 palignr m6, m0, m4, 4
12242 movu [r0 + r6 + 48], m6
12243
12244 palignr m6, m2, m3, 2
12245 movu [r0 + r3], m6
12246 palignr m6, m5, m2, 2
12247 movu [r0 + r3 + 16], m6
12248 palignr m6, m4, m5, 2
12249 movu [r0 + r3 + 32], m6
12250 palignr m6, m0, m4, 2
12251 movu [r0 + r3 + 48], m6
12252 RET
12253
12254;------------------------------------------------------------------------------------------------------------------
12255; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12256;------------------------------------------------------------------------------------------------------------------
; Counterpart of the mode-17 routine with the two reference pointers
; swapped: after the xchg, r2 holds the 4th argument (refAbove in the
; prototype above) and r3 the 3rd (refLeft).  r0 = dst, r1 = dstStride.
; r4/r5/r6 are scratch, m0-m6 are used.
;
; Stack reference buffer built before the loop:
;   [rsp+50 .. rsp+113]  64 bytes copied from r2
;   [rsp+114]            the word at [r2 + 64]
;   [rsp-4  .. rsp+49]   words picked from r3 (shuffled loads plus three
;                        single-word patches)
12257INIT_XMM sse4
12258cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4)
12259 xchg r2, r3
12260 movu m0, [r2 + 0*mmsize]
12261 movu m1, [r2 + 1*mmsize]
12262 movu m2, [r2 + 2*mmsize]
12263 movu m3, [r2 + 3*mmsize]
12264 movu [rsp + 3*mmsize + 2], m0
12265 movu [rsp + 4*mmsize + 2], m1
12266 movu [rsp + 5*mmsize + 2], m2
12267 movu [rsp + 6*mmsize + 2], m3
12268
12269 mov r4w, [r2 + 64]
12270 mov [rsp + 114], r4w
12271 movu m0, [r3 + 8]
12272 movu m1, [r3 + 30]
12273 movu m2, [r3 + 50]
12274 movd m3, [r3 + 2]
12275 pshufb m0, [shuf_mode_17_19]
12276 pshufb m1, [shuf_mode_17_19]
12277 pshufb m2, [shuf_mode_17_19]
12278 pshufb m3, [shuf_mode_16_20]
12279 movd [rsp + 46], m3
12280 movu [rsp + 30], m0
12281 movu [rsp + 12], m1
; NOTE(review): this 16-byte store begins 4 bytes below rsp.  The loop
; starts reading at r2 = rsp and MODE_17_19 only uses non-negative offsets
; from r2, so those 4 bytes look unused.
12282 movu [rsp - 4], m2
; Single-word patches; they must stay after the vector stores above
; because they overwrite or sit between the regions just written.
12283 mov r4w, [r3 + 24]
12284 mov [rsp + 30], r4w
12285 mov r4w, [r3 + 28]
12286 mov [rsp + 28], r4w
12287 mov r4w, [r3 + 46]
12288 mov [rsp + 12], r4w
12289
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r6 keeps the running destination pointer, since the loop reloads r0 from
; it after every macro invocation.
12290 lea r3, [ang_table + 16 * 16]
12291 mov r4d, 8
12292 mov r2, rsp
12293 add r1, r1
12294 lea r5, [r1 * 3]
12295 mova m2, [pw_punpcklwd]
12296 mov r6, r0
12297
; 8 iterations: each one moves dst 8 bytes to the right (instead of down
; by 4 strides as in the mode-17 routine) and the reference pointer by
; 8 bytes.  The macro argument is 0 here; its meaning is defined by
; TRANSPOSE_STORE_8x8, outside this excerpt.
12298.loop:
12299 MODE_17_19 0
12300 add r6, 8
12301 mov r0, r6
12302 add r2, 8
12303 dec r4
12304 jnz .loop
12305 RET
12306
12307;------------------------------------------------------------------------------------------------------------------
12308; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12309;------------------------------------------------------------------------------------------------------------------
; Counterpart of the mode-16 routine with the two reference pointers
; swapped: after the xchg, r2 holds the 4th argument (refAbove in the
; prototype above) and r3 the 3rd (refLeft).  r0 = dst, r1 = dstStride.
; r4/r5/r6 are scratch, m0-m6 are used.
;
; Stack reference buffer built before the loop:
;   [rsp+40 .. rsp+103]  64 bytes copied from r2
;   [rsp+104]            the word at [r2 + 64]
;   [rsp    .. rsp+39]   words picked from r3 by shuf_mode_16_20
12310INIT_XMM sse4
12311cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10)
12312 xchg r2, r3
12313 movu m0, [r2 + 0*mmsize]
12314 movu m1, [r2 + 1*mmsize]
12315 movu m2, [r2 + 2*mmsize]
12316 movu m3, [r2 + 3*mmsize]
12317 movu [rsp + 2*mmsize + 8], m0
12318 movu [rsp + 3*mmsize + 8], m1
12319 movu [rsp + 4*mmsize + 8], m2
12320 movu [rsp + 5*mmsize + 8], m3
12321
12322 mov r4w, [r2 + 64]
12323 mov [rsp + 104], r4w
12324 movu m0, [r3 + 4]
12325 movu m1, [r3 + 22]
12326 movu m2, [r3 + 40]
12327 movd m3, [r3 + 58]
12328 pshufb m0, [shuf_mode_16_20]
12329 pshufb m1, [shuf_mode_16_20]
12330 pshufb m2, [shuf_mode_16_20]
12331 pshufb m3, [shuf_mode_16_20]
; The stores below overlap on purpose and their order matters: each later
; store overwrites the first 4 bytes written by the one before it
; (rsp+24..27, then rsp+12..15, then rsp..rsp+3).
12332 movu [rsp + 24], m0
12333 movu [rsp + 12], m1
12334 movu [rsp], m2
12335 movd [rsp], m3
12336
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r6 keeps the running destination pointer, since the loop reloads r0 from
; it after every macro invocation.
12337 lea r3, [ang_table + 16 * 16]
12338 mov r4d, 8
12339 mov r2, rsp
12340 add r1, r1
12341 lea r5, [r1 * 3]
12342 mova m2, [pw_punpcklwd]
12343 mov r6, r0
12344
; 8 iterations: each one moves dst 8 bytes to the right and the reference
; pointer by 8 bytes.  The macro argument is 0 here; its meaning is defined
; by TRANSPOSE_STORE_8x8, outside this excerpt.
12345.loop:
12346 MODE_16_20 0
12347 add r6, 8
12348 mov r0, r6
12349 add r2, 8
12350 dec r4
12351 jnz .loop
12352 RET
12353
12354;------------------------------------------------------------------------------------------------------------------
12355; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12356;------------------------------------------------------------------------------------------------------------------
; Counterpart of the mode-15 routine with the two reference pointers
; swapped: after the xchg, r2 holds the 4th argument (refAbove in the
; prototype above) and r3 the 3rd (refLeft).  r0 = dst, r1 = dstStride.
; r4/r5/r6 are scratch, m0-m6 are used.
;
; Stack reference buffer built before the loop:
;   [rsp+32 .. rsp+95]  64 bytes copied from r2
;   [rsp+96]            the word at [r2 + 64]
;   [rsp    .. rsp+31]  words picked from r3 by shuf_mode_15_21
;                       (source words 6, 4, 2, 0 of each load; only the
;                       low 8 bytes are stored)
12357INIT_XMM sse4
12358cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2)
12359 xchg r2, r3
12360 movu m0, [r2 + 0*mmsize]
12361 movu m1, [r2 + 1*mmsize]
12362 movu m2, [r2 + 2*mmsize]
12363 movu m3, [r2 + 3*mmsize]
12364 movu [rsp + 2*mmsize], m0
12365 movu [rsp + 3*mmsize], m1
12366 movu [rsp + 4*mmsize], m2
12367 movu [rsp + 5*mmsize], m3
12368
12369 mov r4w, [r2 + 64]
12370 mov [rsp + 96], r4w
12371 movu m0, [r3 + 4]
12372 movu m1, [r3 + 18]
12373 movu m2, [r3 + 34]
12374 movu m3, [r3 + 48]
12375 pshufb m0, [shuf_mode_15_21]
12376 pshufb m1, [shuf_mode_15_21]
12377 pshufb m2, [shuf_mode_15_21]
12378 pshufb m3, [shuf_mode_15_21]
; Four 8-byte stores fill [rsp .. rsp+31] back to back, highest slot first.
12379 movh [rsp + 24], m0
12380 movh [rsp + 16], m1
12381 movh [rsp + 8], m2
12382 movh [rsp], m3
12383
; r3 is re-purposed: [r3 + k*16] now addresses ang_table row 16+k.
; r6 keeps the running destination pointer, since the loop reloads r0 from
; it after every macro invocation.
12384 lea r3, [ang_table + 16 * 16]
12385 mov r4d, 8
12386 mov r2, rsp
12387 add r1, r1
12388 lea r5, [r1 * 3]
12389 mova m2, [pw_punpcklwd]
12390 mov r6, r0
12391
; 8 iterations: each one moves dst 8 bytes to the right and the reference
; pointer by 8 bytes.  The macro argument is 0 here; its meaning is defined
; by TRANSPOSE_STORE_8x8, outside this excerpt.
12392.loop:
12393 MODE_15_21 0
12394 add r6, 8
12395 mov r0, r6
12396 add r2, 8
12397 dec r4
12398 jnz .loop
12399 RET
12400
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Builds a 5*mmsize+10 byte reference buffer on the stack and runs MODE_14_22 over it eight times:
;   [rsp +  0, rsp + 24)  words picked out of refLeft (shuf_mode_14_22 plus one directly copied word)
;   [rsp + 24, rsp + 90)  the first 33 words of refAbove, copied unchanged
; Every store stays inside the allocated frame; nothing is written below rsp.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10)
    xchg        r2, r3                              ; r2 = refAbove, r3 = refLeft

    ; refAbove: four vectors plus one trailing word, placed at rsp + 24
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 1*mmsize + 8], m0
    movu        [rsp + 2*mmsize + 8], m1
    movu        [rsp + 3*mmsize + 8], m2
    movu        [rsp + 4*mmsize + 8], m3
    mov         r4w, [r2 + 64]
    mov         [rsp + 88], r4w

    ; refLeft: one word copied directly, then three shuffled 4-word groups
    mov         r4w, [r3 + 4]
    mov         [rsp + 22], r4w
    movu        m0, [r3 + 10]
    movu        m1, [r3 + 30]
    movu        m2, [r3 + 50]
    pshufb      m0, [shuf_mode_14_22]
    pshufb      m1, [shuf_mode_14_22]
    pshufb      m2, [shuf_mode_14_22]

    ; The lowest group used to be stored with an 8-byte write at [rsp - 2],
    ; i.e. two bytes outside the frame (and Win64 has no red zone). Its first
    ; word would sit below r2 = rsp, the lowest address handed to MODE_14_22
    ; (assumes the macro never reads below r2 -- TODO confirm), so shift it
    ; out and store at [rsp] instead. The spare word this leaves at rsp + 6
    ; is overwritten by the m1 store, hence m2 must be stored before m1.
    psrldq      m2, 2
    movh        [rsp], m2
    movh        [rsp + 6], m1
    movh        [rsp + 14], m0

    ; loop state consumed by MODE_14_22 (macro is defined outside this section)
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)
    mov         r4d, 8                              ; eight passes
    mov         r2, rsp                             ; reference pointer = start of the stack buffer
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 * 3]                        ; 3 * doubled stride
    mova        m2, [pw_punpcklwd]
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass

.loop:
    MODE_14_22 0
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    add         r2, 8                               ; slide the reference window by 8 bytes
    dec         r4
    jnz         .loop
    RET
12446
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Builds a 5*mmsize+2 byte reference buffer on the stack and runs MODE_13_23 over it eight times:
;   [rsp +  0, rsp + 16)  words picked out of refLeft (shuf_mode_13_23, then two single-word patches)
;   [rsp + 16, rsp + 82)  the first 33 words of refAbove, copied unchanged
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2)
    xchg        r2, r3                              ; r2 = refAbove, r3 = refLeft

    ; refAbove: four vectors plus one trailing word, placed at rsp + 16
    movu        m0, [r2 + 0*mmsize]
    movu        [rsp + 1*mmsize], m0
    movu        m1, [r2 + 1*mmsize]
    movu        [rsp + 2*mmsize], m1
    movu        m2, [r2 + 2*mmsize]
    movu        [rsp + 3*mmsize], m2
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 4*mmsize], m3
    mov         r4w, [r2 + 64]
    mov         [rsp + 80], r4w

    ; refLeft, upper group: the shuffle fills words 1-3 of the 8-byte store;
    ; word 0 is then overwritten with refLeft[+28], so the patch must follow the movh.
    movu        m0, [r3 + 8]
    pshufb      m0, [shuf_mode_13_23]
    movh        [rsp + 8], m0
    mov         r4w, [r3 + 28]
    mov         [rsp + 8], r4w

    ; refLeft, lower group: same scheme, word 0 patched from refLeft[+56]
    movu        m1, [r3 + 36]
    pshufb      m1, [shuf_mode_13_23]
    movh        [rsp], m1
    mov         r4w, [r3 + 56]
    mov         [rsp], r4w

    ; loop state consumed by MODE_13_23 (macro is defined outside this section)
    mov         r2, rsp                             ; reference pointer = start of the stack buffer
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)
    mova        m2, [pw_punpcklwd]

.loop:
    MODE_13_23 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12491
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Builds a 4*mmsize+10 byte reference buffer on the stack and runs MODE_12_24 over it eight times:
;   [rsp + 0, rsp +  8)  four single words copied from refLeft (byte offsets 52, 38, 26, 12)
;   [rsp + 8, rsp + 74)  the first 33 words of refAbove, copied unchanged
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10)
    xchg        r2, r3                              ; r2 = refAbove, r3 = refLeft

    ; refAbove: four vectors plus one trailing word, placed at rsp + 8
    movu        m0, [r2 + 0*mmsize]
    movu        [rsp + 0*mmsize + 8], m0
    movu        m1, [r2 + 1*mmsize]
    movu        [rsp + 1*mmsize + 8], m1
    movu        m2, [r2 + 2*mmsize]
    movu        [rsp + 2*mmsize + 8], m2
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 3*mmsize + 8], m3
    mov         r4w, [r2 + 64]
    mov         [rsp + 72], r4w

    ; refLeft: four individual words, written in ascending stack order
    mov         r4w, [r3 + 52]
    mov         [rsp], r4w
    mov         r4w, [r3 + 38]
    mov         [rsp + 2], r4w
    mov         r4w, [r3 + 26]
    mov         [rsp + 4], r4w
    mov         r4w, [r3 + 12]
    mov         [rsp + 6], r4w

    ; loop state consumed by MODE_12_24 (macro is defined outside this section)
    mov         r2, rsp                             ; reference pointer = start of the stack buffer
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)
    mova        m2, [pw_punpcklwd]

.loop:
    MODE_12_24 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12535
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Builds a 4*mmsize+4 byte reference buffer on the stack and runs MODE_11_25 over it eight times:
;   [rsp + 0, rsp +  2)  one word copied from refLeft (byte offset 32)
;   [rsp + 2, rsp + 68)  the first 33 words of refAbove, copied unchanged
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4)
    xchg        r2, r3                              ; r2 = refAbove, r3 = refLeft

    ; refAbove: four vectors plus one trailing word, placed at rsp + 2.
    ; m0-m3 still hold these four vectors when the loop is entered.
    movu        m0, [r2 + 0*mmsize]
    movu        [rsp + 0*mmsize + 2], m0
    movu        m1, [r2 + 1*mmsize]
    movu        [rsp + 1*mmsize + 2], m1
    movu        m2, [r2 + 2*mmsize]
    movu        [rsp + 2*mmsize + 2], m2
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 3*mmsize + 2], m3
    mov         r4w, [r2 + 64]
    mov         [rsp + 66], r4w

    ; refLeft: a single word in front of the copied row
    mov         r4w, [r3 + 32]
    mov         [rsp], r4w

    ; loop state consumed by MODE_11_25 (macro is defined outside this section)
    mov         r2, rsp                             ; reference pointer = start of the stack buffer
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_11_25 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12570
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r3 = refAbove (refLeft in r2 is discarded; r2 is reused as a row offset).
; Loads the 64 bytes that start at refAbove + 2 once and stores that same row to 32 consecutive rows of dst
; (4 loop passes x 2 groups x 4 rows).
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 4,7,4
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r2, [r1 * 2]                        ; byte offset of row 2
    lea         r4, [r1 * 3]                        ; byte offset of row 3
    lea         r5, [r1 * 4]                        ; step between 4-row groups
    mov         r6d, 4                              ; four passes of eight rows each

    ; the row that gets replicated: 32 words starting one word into refAbove
    movu        m0, [r3 + 2]
    movu        m1, [r3 + 18]
    movu        m2, [r3 + 34]
    movu        m3, [r3 + 50]

.loop:
%rep 2                                              ; two 4-row groups per pass
    movu        [r0], m0
    movu        [r0 + 16], m1
    movu        [r0 + 32], m2
    movu        [r0 + 48], m3

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 + 32], m2
    movu        [r0 + r1 + 48], m3

    movu        [r0 + r2], m0
    movu        [r0 + r2 + 16], m1
    movu        [r0 + r2 + 32], m2
    movu        [r0 + r2 + 48], m3

    movu        [r0 + r4], m0
    movu        [r0 + r4 + 16], m1
    movu        [r0 + r4 + 32], m2
    movu        [r0 + r4 + 48], m3

    add         r0, r5                              ; down four rows
%endrep
    dec         r6d
    jnz         .loop
    RET
12635
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_9_27 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_9_27 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12656
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_8_28 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_8_28 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12677
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_7_29 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_7_29 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12698
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_6_30 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_30, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_6_30 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12719
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_5_31 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_31, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_5_31 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12740
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_4_32 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_32, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_4_32 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
12761
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; In:  r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only these four arguments are loaded).
; Runs MODE_3_33 eight times with r2 pointing into refAbove; between passes both r2 and dst advance by 8 bytes.
; This function body itself never reads refLeft.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_33, 4,7,8
    xchg        r2, r3mp                            ; r2 = refAbove; old r2 goes to the argument-3 location
    mov         r6, r0                              ; r6 remembers dst; r0 is re-derived from it every pass
    mov         r4d, 8                              ; eight passes
    add         r1, r1                              ; stride doubled (presumably pixels -> bytes; samples are words)
    lea         r5, [r1 + r1 * 2]                   ; 3 * doubled stride
    lea         r3, [ang_table + 16 * 16]           ; row 16 of ang_table (16 bytes per row)

.loop:
    MODE_3_33 0
    add         r2, 8                               ; slide the reference window by 8 bytes
    add         r6, 8                               ; next 8-byte-wide strip of dst
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET